diff --git a/Topaz_Tools/lib/changes.txt b/Topaz_Tools/lib/changes.txt index 83910c0..125a869 100644 --- a/Topaz_Tools/lib/changes.txt +++ b/Topaz_Tools/lib/changes.txt @@ -1,17 +1,29 @@ -Changes in version 1.8 +Changes in version 2.0 + - gensvg.py now accepts two options + -x : output browseable XHTML+SVG pages (default) + -r : output raw SVG images (useful for later conversion to pdf) + + - flatxml2html.py now understands page.groups of type graphic + and handles vertical regions as svg images + + - genhtml.py now accepts an option + --fixed-image : which will force the conversion + of all fixed regions to svg images + + - minor bug fixes and html conversion improvements + + +Changes in version 1.8 - gensvg.py now builds wonderful xhtml pages with embedded svg that can be easily paged through as if reading a book! (tested in Safari for Mac and Win and Firefox) (requires javascript to be enabled) - - genhtml.py now REQUIRES that gensvg.py be run FIRST this allows create of images on the fly from glyphs - - genhtml.py now automatically makes tables of words into svg based images and will handle glyph based ornate first letters of words - - cmbtc_dump_mac_linux.py has been renamed to be cmbtc_dump_nonK4PC.py to make it clearer when it needs to be used diff --git a/Topaz_Tools/lib/cmbtc_dump.py b/Topaz_Tools/lib/cmbtc_dump.py index ac7e33c..de0ecf7 100644 --- a/Topaz_Tools/lib/cmbtc_dump.py +++ b/Topaz_Tools/lib/cmbtc_dump.py @@ -1,5 +1,5 @@ #! /usr/bin/python -# For use in Topaz Scripts version 1.8 +# For use in Topaz Scripts version 2.0 """ diff --git a/Topaz_Tools/lib/cmbtc_dump_nonK4PC.py b/Topaz_Tools/lib/cmbtc_dump_nonK4PC.py index ed7ff87..5e43ae6 100644 --- a/Topaz_Tools/lib/cmbtc_dump_nonK4PC.py +++ b/Topaz_Tools/lib/cmbtc_dump_nonK4PC.py @@ -1,5 +1,5 @@ #! 
/usr/bin/python -# For use with Topaz Scripts Version 1.8 +# For use with Topaz Scripts Version 2.0 from __future__ import with_statement diff --git a/Topaz_Tools/lib/convert2xml.py b/Topaz_Tools/lib/convert2xml.py index 12ca934..4e84184 100644 --- a/Topaz_Tools/lib/convert2xml.py +++ b/Topaz_Tools/lib/convert2xml.py @@ -1,6 +1,6 @@ #! /usr/bin/python # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab -# For use with Topaz Scripts Version 1.8 +# For use with Topaz Scripts Version 2.0 from __future__ import with_statement import csv diff --git a/Topaz_Tools/lib/decode_meta.py b/Topaz_Tools/lib/decode_meta.py index ba831ec..9f58a53 100644 --- a/Topaz_Tools/lib/decode_meta.py +++ b/Topaz_Tools/lib/decode_meta.py @@ -1,6 +1,6 @@ #! /usr/bin/python # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab -# For use with Topaz Scripts Version 1.8 +# For use with Topaz Scripts Version 2.0 from __future__ import with_statement import csv diff --git a/Topaz_Tools/lib/flatxml2html.py b/Topaz_Tools/lib/flatxml2html.py index 4182502..eaeeabe 100644 --- a/Topaz_Tools/lib/flatxml2html.py +++ b/Topaz_Tools/lib/flatxml2html.py @@ -1,6 +1,6 @@ #! 
/usr/bin/python # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab -# For use with Topaz Scripts Version 1.8 +# For use with Topaz Scripts Version 2.0 from __future__ import with_statement import csv @@ -13,7 +13,7 @@ from struct import unpack class DocParser(object): - def __init__(self, flatxml, classlst, fileid, bookDir): + def __init__(self, flatxml, classlst, fileid, bookDir, fixedimage): self.id = os.path.basename(fileid).replace('.dat','') self.svgcount = 0 self.docList = flatxml.split('\n') @@ -28,6 +28,7 @@ class DocParser(object): # remove the leading period from the css name cname = pclass[1:] self.classList[cname] = True + self.fixedimage = fixedimage self.ocrtext = [] self.link_id = [] self.link_title = [] @@ -63,7 +64,7 @@ class DocParser(object): imgname = self.id + '_%04d.svg' % self.svgcount imgfile = os.path.join(imgDir,imgname) - # build hash table of glyph paths keyed by glyph id + # build hashtable of glyph paths keyed by glyph id if self.numPaths == 0: gfile = open(glyfile, 'r') while True: @@ -194,15 +195,9 @@ class DocParser(object): return argres - - # build a description of the paragraph - def getParaDescription(self, start, end): - - result = [] - - # paragraph - (pos, pclass) = self.findinDoc('paragraph.class',start,end) - + # get the class + def getClass(self, pclass): + nclass = pclass # class names are an issue given topaz may start them with numerals (not allowed), # use a mix of cases (which cause some browsers problems), and actually # attach numbers after "_reclustered*" to the end to deal classeses that inherit @@ -212,17 +207,85 @@ class DocParser(object): # so we clean this up by lowercasing, prepend 'cl-', and getting any baseclass # that exists in the stylesheet first, and then adding this specific class # after - if pclass != None : + if nclass != None : classres = '' - pclass = pclass.lower() - pclass = 'cl-' + pclass - p = pclass.find('_') - if p > 0 : - baseclass = pclass[0:p] - if baseclass in self.classList: - classres += 
baseclass + ' ' - classres += pclass - pclass = classres + nclass = nclass.lower() + nclass = 'cl-' + nclass + baseclass = '' + # graphic is the base class for captions + if nclass.find('cl-cap-') >=0 : + classres = 'graphic' + ' ' + else : + # strip to find baseclass + p = nclass.find('_') + if p > 0 : + baseclass = nclass[0:p] + if baseclass in self.classList: + classres += baseclass + ' ' + classres += nclass + nclass = classres + return nclass + + + # develop a sorted description of the starting positions of + # groups and regions on the page, as well as the page type + def PageDescription(self): + + def compare(x, y): + (xtype, xval) = x + (ytype, yval) = y + if xval > yval: + return 1 + if xval == yval: + return 0 + return -1 + + result = [] + (pos, pagetype) = self.findinDoc('page.type',0,-1) + + groupList = self.posinDoc('page.group') + groupregionList = self.posinDoc('page.group.region') + pageregionList = self.posinDoc('page.region') + # integrate into one list + for j in groupList: + result.append(('grpbeg',j)) + for j in groupregionList: + result.append(('gregion',j)) + for j in pageregionList: + result.append(('pregion',j)) + result.sort(compare) + + # insert group end and page end indicators + inGroup = False + j = 0 + while True: + if j == len(result): break + rtype = result[j][0] + rval = result[j][1] + if not inGroup and (rtype == 'grpbeg') : + inGroup = True + j = j + 1 + elif inGroup and (rtype in ('grpbeg', 'pregion')): + result.insert(j,('grpend',rval)) + inGroup = False + else: + j = j + 1 + if inGroup: + result.append(('grpend',-1)) + result.append(('pageend', -1)) + return pagetype, result + + + + # build a description of the paragraph + def getParaDescription(self, start, end, regtype): + + result = [] + + # paragraph + (pos, pclass) = self.findinDoc('paragraph.class',start,end) + + pclass = self.getClass(pclass) # build up a description of the paragraph in result and return it # first check for the basic - all words paragraph @@ -231,13 
+294,49 @@ class DocParser(object): if (sfirst != None) and (slast != None) : first = int(sfirst) last = int(slast) - for wordnum in xrange(first, last): - result.append(('ocr', wordnum)) + + makeImage = (regtype == 'vertical') or (regtype == 'table') + if self.fixedimage: + makeImage = makeImage or (regtype == 'fixed') + + if (pclass != None): + makeImage = makeImage or (pclass.find('.inverted') >= 0) + if self.fixedimage : + makeImage = makeImage or (pclass.find('cl-f-') >= 0) + + if not makeImage : + # standard all word paragraph + for wordnum in xrange(first, last): + result.append(('ocr', wordnum)) + return pclass, result + + # convert paragraph to svg image + # translate first and last word into first and last glyphs + # and generate inline image and include it + glyphList = [] + firstglyphList = self.getData('word.firstGlyph',0,-1) + gidList = self.getData('info.glyph.glyphID',0,-1) + firstGlyph = firstglyphList[first] + if last < len(firstglyphList): + lastGlyph = firstglyphList[last] + else : + lastGlyph = len(gidList) + for glyphnum in xrange(firstGlyph, lastGlyph): + glyphList.append(glyphnum) + # include any extratokens if they exist + (pos, sfg) = self.findinDoc('extratokens.firstGlyph',start,end) + (pos, slg) = self.findinDoc('extratokens.lastGlyph',start,end) + if (sfg != None) and (slg != None): + for glyphnum in xrange(int(sfg), int(slg)): + glyphList.append(glyphnum) + num = self.svgcount + self.glyphs_to_image(glyphList) + self.svgcount += 1 + result.append(('svg', num)) return pclass, result - # this type of paragrph may be made up of multiple _spans, inline - # word monograms (images) and words with semantic meaning - # and now a new type "span" versus the old "_span" + # this type of paragraph may be made up of multiple spans, inline + # word monograms (images), and words with semantic meaning, # plus glyphs used to form starting letter of first word # need to parse this type line by line @@ -252,6 +351,7 @@ class DocParser(object): (name, 
argres) = self.lineinDoc(line) + # handle both span and _span if name.endswith('span.firstWord') : first = int(argres) (name, argres) = self.lineinDoc(line+1) @@ -422,148 +522,78 @@ class DocParser(object): else: self.link_title.append('') - - # get page type - (pos, pagetype) = self.findinDoc('page.type',0,-1) - - - # generate a list of each region starting point - # each region has one paragraph,, or one image, or one chapterheading - - regionList= self.posinDoc('region') - regcnt = len(regionList) - regionList.append(-1) + # get a description of the starting points of the regions + # and groups on the page + (pagetype, pageDesc) = self.PageDescription() + regcnt = len(pageDesc) - 1 anchorSet = False breakSet = False - - # process each region tag and convert what you can to html + inGroup = False + + # process each region on the page and convert what you can to html for j in xrange(regcnt): - start = regionList[j] - end = regionList[j+1] - - (pos, regtype) = self.findinDoc('region.type',start,end) + (etype, start) = pageDesc[j] + (ntype, end) = pageDesc[j+1] + # set anchor for link target on this page if not anchorSet and not first_para_continued: - htmlpage += '
\n' + htmlpage += '\n' anchorSet = True - if regtype == 'graphic' : - (pos, simgsrc) = self.findinDoc('img.src',start,end) - if simgsrc: - htmlpage += '' % int(simgsrc) + # handle groups of graphics with text captions + if (etype == 'grpbeg'): + (pos, grptype) = self.findinDoc('group.type', start, end) + if grptype != None: + if grptype == 'graphic': + gcstr = ' class="' + grptype + '"' + htmlpage += '