topazscripts 1.5

This commit is contained in:
some_updates 2010-01-20 12:13:31 +00:00 committed by Apprentice Alf
parent c1e5943471
commit a1fec0b54d
7 changed files with 186 additions and 60 deletions

View File

@ -0,0 +1,20 @@
Changes in version 1.5
- completely reworked generation of styles to use actual page heights and widths
- added new script getpagedim.py to support the above
- style names with underscores in them are now properly paired with their base class
- fixed hanging indents that did not ever set a left margin
- added support for a number of not previously known region types
- added support for a previously unknown snippet - <empty></empty>
- corrected a bug that caused unknown regions to abort the program
- added code to make the handling of unknown regions better in general
- corrected a bug that caused the last link on a page to be missing (if it was the last thing on the page)
Changes in version 1.3
- font generation by gensvg.py is now greatly improved with support for contour points added
- support for more region types
- support for inline images in paragraphs or text fields (i.e. initial graphics for the first letter of a word)
- greatly improved dtd information used for the xml to prevent parsing mistakes
Version 1.0
- initial release

View File

@ -209,6 +209,8 @@ class PageParser(object):
'wordStems' : (0, 'number', 1, 1), 'wordStems' : (0, 'number', 1, 1),
'wordStems.stemID' : (1, 'number', 0, 0), 'wordStems.stemID' : (1, 'number', 0, 0),
'empty' : (1, 'snippets', 1, 0),
'page' : (1, 'snippets', 1, 0), 'page' : (1, 'snippets', 1, 0),
'page.pageid' : (1, 'scalar_text', 0, 0), 'page.pageid' : (1, 'scalar_text', 0, 0),
'page.pagelabel' : (1, 'scalar_text', 0, 0), 'page.pagelabel' : (1, 'scalar_text', 0, 0),
@ -750,6 +752,7 @@ def main(argv):
# read in the string table dictionary # read in the string table dictionary
dict = Dictionary(dictFile) dict = Dictionary(dictFile)
# dict.dumpDict()
# create a page parser # create a page parser
pp = PageParser(pageFile, dict, debug, flat_xml) pp = PageParser(pageFile, dict, debug, flat_xml)

View File

@ -90,20 +90,23 @@ class DocParser(object):
# class names are an issue given topaz may start them with numerals (not allowed), # class names are an issue given topaz may start them with numerals (not allowed),
# use a mix of cases (which cause some browsers problems), and actually # use a mix of cases (which cause some browsers problems), and actually
# attach numbers after "_reclustered*" to the end to deal with reflow issues # attach numbers after "_reclustered*" to the end to deal classeses that inherit
# but then not actually provide all of these _reclustereed classes in the stylesheet! # from a base class (but then not actually provide all of these _reclustereed
# classes in the stylesheet!
# so we clean this up by lowercasing, prepend 'cl_', and if not in the class # so we clean this up by lowercasing, prepend 'cl_', and getting any baseclass
# list from the stylesheet, trying once more with "_reclustered*" removed # that exists in the stylesheet first, and then adding this specific class
# if still not in stylesheet, let it pass as is # after
classres = ''
pclass = pclass.lower() pclass = pclass.lower()
pclass = 'cl_' + pclass pclass = 'cl-' + pclass
if pclass not in self.classList: p = pclass.find('_')
p = pclass.find('_reclustered')
if p > 0 : if p > 0 :
baseclass = pclass[0:p] baseclass = pclass[0:p]
if baseclass in self.classList: if baseclass in self.classList:
pclass = baseclass classres += baseclass + ' '
classres += pclass
pclass = classres
# build up a description of the paragraph in result and return it # build up a description of the paragraph in result and return it
# first check for the basic - all words paragraph # first check for the basic - all words paragraph
@ -123,6 +126,12 @@ class DocParser(object):
line = start + 1 line = start + 1
word_class = '' word_class = ''
# if end is -1 then we must search to end of document
if end == -1 :
docList = self.flatdoc
cnt = len(docList)
end = cnt
while (line < end) : while (line < end) :
(name, argres) = self.lineinDoc(line) (name, argres) = self.lineinDoc(line)
@ -139,7 +148,8 @@ class DocParser(object):
elif name.endswith('word.class'): elif name.endswith('word.class'):
(cname, space) = argres.split('-',1) (cname, space) = argres.split('-',1)
if cname == 'spaceafter': if space == '' : space = '0'
if (cname == 'spaceafter') and (int(space) > 0) :
word_class = 'sa' word_class = 'sa'
elif name.endswith('word.img.src'): elif name.endswith('word.img.src'):
@ -193,7 +203,8 @@ class DocParser(object):
link = self.link_id[num] link = self.link_id[num]
if (link > 0): if (link > 0):
title = self.link_title[link-1] title = self.link_title[link-1]
if title == "": title='_link_' if (title == "") or (parares.rfind(title) < 0):
title='_link_'
ptarget = self.link_page[link-1] - 1 ptarget = self.link_page[link-1] - 1
linkhtml = '<a href="#page%04d">' % ptarget linkhtml = '<a href="#page%04d">' % ptarget
linkhtml += title + '</a>' linkhtml += title + '</a>'
@ -326,7 +337,7 @@ class DocParser(object):
htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype) htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
htmlpage += '</' + tag + '>' htmlpage += '</' + tag + '>'
elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') : elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') or (regtype == 'listitem') :
ptype = 'full' ptype = 'full'
# check to see if this is a continution from the previous page # check to see if this is a continution from the previous page
if (len(self.parastems_stemid) > 0): if (len(self.parastems_stemid) > 0):
@ -348,7 +359,6 @@ class DocParser(object):
else : else :
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype) htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
elif (regtype == 'tocentry') : elif (regtype == 'tocentry') :
ptype = 'full' ptype = 'full'
# check to see if this is a continution from the previous page # check to see if this is a continution from the previous page
@ -363,7 +373,7 @@ class DocParser(object):
(pclass, pdesc) = self.getParaDescription(start,end) (pclass, pdesc) = self.getParaDescription(start,end)
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype) htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
elif regtype == 'synth_fcvr.center' : elif (regtype == 'synth_fcvr.center') or (regtype == 'synth_text.center'):
if not anchorSet: if not anchorSet:
htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n' htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
anchorSet = True anchorSet = True
@ -373,7 +383,9 @@ class DocParser(object):
else : else :
print 'Warning: Unknown region type', regtype print 'Warning: Unknown region type', regtype
print 'Treating this like a "fixed" region' (pos, temp) = self.findinDoc('paragraph',start,end)
if temp:
print 'Treating this like a "text" region'
regtype = 'fixed' regtype = 'fixed'
ptype = 'full' ptype = 'full'
# check to see if this is a continution from the previous page # check to see if this is a continution from the previous page
@ -384,7 +396,7 @@ class DocParser(object):
if not anchorSet: if not anchorSet:
htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n' htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
anchorSet = True anchorSet = True
(pclass, desc) = self.getParaDescription(start,end) (pclass, pdesc) = self.getParaDescription(start,end)
if ptype == 'full' : if ptype == 'full' :
tag = 'p' tag = 'p'
if pclass[3:6] == 'h1-' : tag = 'h4' if pclass[3:6] == 'h1-' : tag = 'h4'
@ -395,8 +407,14 @@ class DocParser(object):
htmlpage += '</' + tag + '>' htmlpage += '</' + tag + '>'
else : else :
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype) htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
else :
print 'Treating this like a "image" region'
if not anchorSet:
htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
anchorSet = True
(pos, simgsrc) = self.findinDoc('img.src',start,end)
if simgsrc:
htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
if len(self.paracont_stemid) > 0 : if len(self.paracont_stemid) > 0 :
if htmlpage[-4:] == '</p>': if htmlpage[-4:] == '</p>':

View File

@ -8,7 +8,7 @@ import convert2xml
import flatxml2html import flatxml2html
import decode_meta import decode_meta
import stylexml2css import stylexml2css
import getpagedim
def usage(): def usage():
print 'Usage: ' print 'Usage: '
@ -86,6 +86,7 @@ def main(argv):
htmlstr += '<head>\n' htmlstr += '<head>\n'
# process metadata and retrieve fontSize info
print ' ', 'metadata0000.dat' print ' ', 'metadata0000.dat'
fname = os.path.join(bookDir,'metadata0000.dat') fname = os.path.join(bookDir,'metadata0000.dat')
xname = os.path.join(bookDir, 'metadata.txt') xname = os.path.join(bookDir, 'metadata.txt')
@ -100,12 +101,27 @@ def main(argv):
if 'fontSize' in meta_array: if 'fontSize' in meta_array:
fontsize = meta_array['fontSize'] fontsize = meta_array['fontSize']
# also get the size of a normal text page
spage = '1'
if 'firstTextPage' in meta_array:
spage = meta_array['firstTextPage']
pnum = int(spage)
# get page height and width from first text page for use in stylesheet scaling
pname = 'page%04d.dat' % pnum
fname = os.path.join(pageDir,pname)
flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
(ph, pw) = getpagedim.getPageDim(flat_xml)
if (ph == '-1') : ph = 11000
if (pw == '-1') : pw = 8500
# now build up the style sheet
print ' ', 'other0000.dat' print ' ', 'other0000.dat'
fname = os.path.join(bookDir,'other0000.dat') fname = os.path.join(bookDir,'other0000.dat')
xname = os.path.join(bookDir, 'style.css') xname = os.path.join(bookDir, 'style.css')
xmlstr = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname) xmlstr = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
htmlstr += '<style>\n' htmlstr += '<style>\n'
cssstr , classlst = stylexml2css.convert2CSS(xmlstr, fontsize) cssstr , classlst = stylexml2css.convert2CSS(xmlstr, fontsize, ph, pw)
file(xname, 'wb').write(cssstr) file(xname, 'wb').write(cssstr)
htmlstr += cssstr htmlstr += cssstr
htmlstr += '</style>\n' htmlstr += '</style>\n'

View File

@ -0,0 +1,53 @@
#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
from __future__ import with_statement
import csv
import sys
import os
import getopt
from struct import pack
from struct import unpack
class DocParser(object):
    """Minimal reader for a "flattened" xml page description.

    The document is held as a list of text lines, each of the form
    ``tagpath`` or ``tagpath=value``.
    """

    def __init__(self, flatxml):
        # one list entry per line of the flattened xml text
        self.flatdoc = flatxml.split('\n')

    def findinDoc(self, tagpath, pos, end):
        """Find the first line whose tag name ends with tagpath.

        Lines are searched from index pos up to, but not including, end.
        An end of -1 means "search to the end of the document"; any other
        end is clamped to the document length.

        Returns a tuple (foundat, result): the index of the matching line
        and the text after its first '=' ('' when the line has no '=').
        When nothing matches, (-1, None) is returned.
        """
        docList = self.flatdoc
        cnt = len(docList)
        if end == -1:
            end = cnt
        else:
            end = min(cnt, end)
        # range (not xrange) so the module also runs on Python 3
        for j in range(pos, end):
            # split on the FIRST '=' only: a value may itself contain '=',
            # and a plain split('=') would then fail to unpack into two names
            name, _, argres = docList[j].partition('=')
            if name.endswith(tagpath):
                return j, argres
        return -1, None

    def process(self):
        """Return (height, width) taken from the 'page.h' and 'page.w' entries.

        Both values are returned as strings exactly as they appear in the
        document; a missing entry is reported as the string '-1'.
        """
        (pos, sph) = self.findinDoc('page.h', 0, -1)
        (pos, spw) = self.findinDoc('page.w', 0, -1)
        if sph is None:
            sph = '-1'
        if spw is None:
            spw = '-1'
        return sph, spw
def getPageDim(flatxml):
    """Return the (height, width) pair for the page described by flatxml.

    flatxml is the flattened xml text of a page; it is handed to a
    DocParser, whose process() result is passed back unchanged.
    """
    parser = DocParser(flatxml)
    height, width = parser.process()
    return height, width

View File

@ -3,7 +3,8 @@ Contributors:
clarknova - for all of the svg and glyph generation and many other bug fixes and improvements clarknova - for all of the svg and glyph generation and many other bug fixes and improvements
skindle - for figuring out the general case for the mode loops skindle - for figuring out the general case for the mode loops
some updates - for conversion to xml, basic html some updates - for conversion to xml, basic html
DiapDealer - for extensive testing and feeback DiapDealer - for extensive testing and feedback
stewball - for extensive testing and feedback
and others for posting, feedback and testing and others for posting, feedback and testing
@ -23,12 +24,13 @@ decode_meta.py - converts metadata0000.dat to human readable text (for the most
convert2xml.py - converts page*.dat, other*.dat, and glyphs*.dat files to pseudo xml descriptions convert2xml.py - converts page*.dat, other*.dat, and glyphs*.dat files to pseudo xml descriptions
flatxml2html.py - converts a "flattened" xml description to html using the ocrtext flatxml2html.py - converts a "flattened" xml description to html using the ocrtext
stylexml2css.py - converts stylesheet "flattened" xml into css (as best it can) stylexml2css.py - converts stylesheet "flattened" xml into css (as best it can)
getpagedim.py - reads page0000.dat to get the book height and width parameters
genxml.py - main program to convert everything to xml genxml.py - main program to convert everything to xml
genhtml.py - main program to generate "book.html" genhtml.py - main program to generate "book.html"
gensvg.py - (author: clarknova) main program to create an svg graphic of each page gensvg.py - (author: clarknova) main program to create an svg graphic of each page
Please note, gensvg.py, genhtml.py, and genxml.py import and use Please note, gensvg.py, genhtml.py, and genxml.py import and use
decode_meta.py, convert2xml.py, flatxml2html.py, and stylexml2css.py decode_meta.py, convert2xml.py, flatxml2html.py, getpagedim.py and stylexml2css.py
so please keep all of these python scripts together in the same place. so please keep all of these python scripts together in the same place.

View File

@ -11,9 +11,11 @@ from struct import unpack
class DocParser(object): class DocParser(object):
def __init__(self, flatxml, fontsize): def __init__(self, flatxml, fontsize, ph, pw):
self.flatdoc = flatxml.split('\n') self.flatdoc = flatxml.split('\n')
self.fontsize = int(fontsize) self.fontsize = int(fontsize)
self.ph = int(ph) * 1.0
self.pw = int(pw) * 1.0
stags = { stags = {
'paragraph' : 'p', 'paragraph' : 'p',
@ -106,14 +108,14 @@ class DocParser(object):
# get the style class # get the style class
(pos, sclass) = self.findinDoc('style.class',start,end) (pos, sclass) = self.findinDoc('style.class',start,end)
if sclass != None: if sclass != None:
sclass = '.cl_' + sclass.lower() sclass = '.cl-' + sclass.lower()
else : else :
sclass = '' sclass = ''
# check for any "after class" specifiers # check for any "after class" specifiers
(pos, aftclass) = self.findinDoc('style._after_class',start,end) (pos, aftclass) = self.findinDoc('style._after_class',start,end)
if aftclass != None: if aftclass != None:
aftclass = '.cl_' + aftclass.lower() aftclass = '.cl-' + aftclass.lower()
else : else :
aftclass = '' aftclass = ''
@ -121,8 +123,8 @@ class DocParser(object):
while True : while True :
(pos, attr) = self.findinDoc('style.rule.attr', start, end) (pos1, attr) = self.findinDoc('style.rule.attr', start, end)
(pos, val) = self.findinDoc('style.rule.value', start, end) (pos2, val) = self.findinDoc('style.rule.value', start, end)
if attr == None : break if attr == None : break
@ -135,29 +137,35 @@ class DocParser(object):
# handle value based attributes # handle value based attributes
if attr in self.attr_val_map : if attr in self.attr_val_map :
name = self.attr_val_map[attr] name = self.attr_val_map[attr]
scale = self.fontsize if attr in ('margin-bottom', 'margin-top', 'space-after') :
if attr == 'line-space': scale = scale * 1.41 scale = self.ph
elif attr in ('margin-right', 'indent', 'margin-left', 'hang') :
scale = self.pw
elif attr == 'line-space':
scale = self.fontsize * 2.0
if not ((attr == 'hang') and (int(val) == 0)) : if not ((attr == 'hang') and (int(val) == 0)) :
ems = int(val)/scale pv = float(val)/scale
cssargs[attr] = (self.attr_val_map[attr], ems) cssargs[attr] = (self.attr_val_map[attr], pv)
keep = True keep = True
start = pos + 1 start = max(pos1, pos2) + 1
# disable all of the after class tags until I figure out how to handle them # disable all of the after class tags until I figure out how to handle them
if aftclass != "" : keep = False if aftclass != "" : keep = False
if keep : if keep :
# make sure line-space does not go below 1em # make sure line-space does not go below 100% or above 300% since
# it can be wacky in some styles
if 'line-space' in cssargs: if 'line-space' in cssargs:
seg = cssargs['line-space'][0] seg = cssargs['line-space'][0]
val = cssargs['line-space'][1] val = cssargs['line-space'][1]
if val < 1.0: val = 1.0 if val < 1.0: val = 1.0
if val > 3.0: val = 3.0
del cssargs['line-space'] del cssargs['line-space']
cssargs['line-space'] = (self.attr_val_map['line-space'], val) cssargs['line-space'] = (self.attr_val_map['line-space'], val)
# handle modifications for css style hanging indents # handle modifications for css style hanging indents
if 'hang' in cssargs: if 'hang' in cssargs:
hseg = cssargs['hang'][0] hseg = cssargs['hang'][0]
@ -166,9 +174,11 @@ class DocParser(object):
cssargs['hang'] = (self.attr_val_map['hang'], -hval) cssargs['hang'] = (self.attr_val_map['hang'], -hval)
mval = 0 mval = 0
mseg = 'margin-left: ' mseg = 'margin-left: '
mval = hval
if 'margin-left' in cssargs: if 'margin-left' in cssargs:
mseg = cssargs['margin-left'][0] mseg = cssargs['margin-left'][0]
mval = cssargs['margin-left'][1] mval = cssargs['margin-left'][1]
if mval < 0: mval = 0
mval = hval + mval mval = hval + mval
cssargs['margin-left'] = (mseg, mval) cssargs['margin-left'] = (mseg, mval)
if 'indent' in cssargs: if 'indent' in cssargs:
@ -181,7 +191,7 @@ class DocParser(object):
if mval == '': if mval == '':
cssline += mseg + ' ' cssline += mseg + ' '
else : else :
aseg = mseg + '%.1fem;' % mval aseg = mseg + '%.1f%%;' % (mval * 100.0)
cssline += aseg + ' ' cssline += aseg + ' '
cssline += '}' cssline += '}'
@ -213,10 +223,14 @@ class DocParser(object):
def convert2CSS(flatxml, fontsize): def convert2CSS(flatxml, fontsize, ph, pw):
print ' ', 'Using font size:',fontsize
print ' ', 'Using page height:', ph
print ' ', 'Using page width:', pw
# create a document parser # create a document parser
dp = DocParser(flatxml, fontsize) dp = DocParser(flatxml, fontsize, ph, pw)
csspage = dp.process() csspage = dp.process()