From a1fec0b54d5d5db2e2681d2dc333eb2d34919caf Mon Sep 17 00:00:00 2001
From: some_updates
Date: Wed, 20 Jan 2010 12:13:31 +0000
Subject: [PATCH] topazscripts 1.5
---
Topaz_Tools/lib/changes.txt | 20 +++++++
Topaz_Tools/lib/convert2xml.py | 5 +-
Topaz_Tools/lib/flatxml2html.py | 96 +++++++++++++++++++--------------
Topaz_Tools/lib/genhtml.py | 20 ++++++-
Topaz_Tools/lib/getpagedim.py | 53 ++++++++++++++++++
Topaz_Tools/lib/readme.txt | 6 ++-
Topaz_Tools/lib/stylexml2css.py | 46 ++++++++++------
7 files changed, 186 insertions(+), 60 deletions(-)
create mode 100644 Topaz_Tools/lib/changes.txt
create mode 100644 Topaz_Tools/lib/getpagedim.py
diff --git a/Topaz_Tools/lib/changes.txt b/Topaz_Tools/lib/changes.txt
new file mode 100644
index 0000000..cc2f00a
--- /dev/null
+++ b/Topaz_Tools/lib/changes.txt
@@ -0,0 +1,20 @@
+Changes in version 1.5
+ - completely reworked generation of styles to use actual page heights and widths
+ - added new script getpagedim.py to support the above
+ - style names with underscores in them are now properly paired with their base class
+ - fixed hanging indents that did not ever set a left margin
+ - added support for a number of not previously known region types
+ - added support for a previously unknown snippet - 'empty'
+ - corrected a bug that caused unknown regions to abort the program
+ - added code to make the handling of unknown regions better in general
+ - corrected a bug that caused the last link on a page to be missing (if it was the last thing on the page)
+
+Changes in version 1.3
+ - font generation by gensvg.py is now greatly improved with support for contour points added
+ - support for more region types
+ - support for inline images in paragraphs or text fields (ie. initial graphics for the first letter of a word)
+ - greatly improved dtd information used for the xml to prevent parsing mistakes
+
+Version 1.0
+ - initial release
+
diff --git a/Topaz_Tools/lib/convert2xml.py b/Topaz_Tools/lib/convert2xml.py
index 4bec36f..07741a7 100644
--- a/Topaz_Tools/lib/convert2xml.py
+++ b/Topaz_Tools/lib/convert2xml.py
@@ -93,7 +93,7 @@ def convert(i):
for j in xrange(len(val)):
c = ord(val[j:j+1])
result += '%02x' % c
- return result
+ return result
@@ -209,6 +209,8 @@ class PageParser(object):
'wordStems' : (0, 'number', 1, 1),
'wordStems.stemID' : (1, 'number', 0, 0),
+ 'empty' : (1, 'snippets', 1, 0),
+
'page' : (1, 'snippets', 1, 0),
'page.pageid' : (1, 'scalar_text', 0, 0),
'page.pagelabel' : (1, 'scalar_text', 0, 0),
@@ -750,6 +752,7 @@ def main(argv):
# read in the string table dictionary
dict = Dictionary(dictFile)
+ # dict.dumpDict()
# create a page parser
pp = PageParser(pageFile, dict, debug, flat_xml)
diff --git a/Topaz_Tools/lib/flatxml2html.py b/Topaz_Tools/lib/flatxml2html.py
index f93318f..f2dd244 100644
--- a/Topaz_Tools/lib/flatxml2html.py
+++ b/Topaz_Tools/lib/flatxml2html.py
@@ -90,20 +90,23 @@ class DocParser(object):
# class names are an issue given topaz may start them with numerals (not allowed),
# use a mix of cases (which cause some browsers problems), and actually
- # attach numbers after "_reclustered*" to the end to deal with reflow issues
- # but then not actually provide all of these _reclustereed classes in the stylesheet!
+ # attach numbers after "_reclustered*" to the end to deal with classes that inherit
+ # from a base class (but then not actually provide all of these _reclustered
+ # classes in the stylesheet!)
- # so we clean this up by lowercasing, prepend 'cl_', and if not in the class
- # list from the stylesheet, trying once more with "_reclustered*" removed
- # if still not in stylesheet, let it pass as is
+ # so we clean this up by lowercasing, prepending 'cl-', and getting any baseclass
+ # (the part of the name before the first '_') that exists in the stylesheet first,
+ # and then adding this specific class after it
+ classres = ''
pclass = pclass.lower()
- pclass = 'cl_' + pclass
- if pclass not in self.classList:
- p = pclass.find('_reclustered')
- if p > 0 :
- baseclass = pclass[0:p]
- if baseclass in self.classList:
- pclass = baseclass
+ pclass = 'cl-' + pclass
+ p = pclass.find('_')
+ if p > 0 :
+ baseclass = pclass[0:p]
+ if baseclass in self.classList:
+ classres += baseclass + ' '
+ classres += pclass
+ pclass = classres
# build up a description of the paragraph in result and return it
# first check for the basic - all words paragraph
@@ -123,6 +126,12 @@ class DocParser(object):
line = start + 1
word_class = ''
+ # if end is -1 then we must search to end of document
+ if end == -1 :
+ docList = self.flatdoc
+ cnt = len(docList)
+ end = cnt
+
while (line < end) :
(name, argres) = self.lineinDoc(line)
@@ -139,7 +148,8 @@ class DocParser(object):
elif name.endswith('word.class'):
(cname, space) = argres.split('-',1)
- if cname == 'spaceafter':
+ if space == '' : space = '0'
+ if (cname == 'spaceafter') and (int(space) > 0) :
word_class = 'sa'
elif name.endswith('word.img.src'):
@@ -166,7 +176,7 @@ class DocParser(object):
sep =''
br_lb = False
- if (regtype == 'fixed') or (regtype == 'chapterheading') :
+ if (regtype == 'fixed') or (regtype == 'chapterheading'):
br_lb = True
handle_links = False
@@ -193,7 +203,8 @@ class DocParser(object):
link = self.link_id[num]
if (link > 0):
title = self.link_title[link-1]
- if title == "": title='_link_'
+ if (title == "") or (parares.rfind(title) < 0):
+ title='_link_'
ptarget = self.link_page[link-1] - 1
linkhtml = '' % ptarget
linkhtml += title + ''
@@ -326,7 +337,7 @@ class DocParser(object):
htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
htmlpage += '' + tag + '>'
- elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') :
+ elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') or (regtype == 'listitem') :
ptype = 'full'
# check to see if this is a continution from the previous page
if (len(self.parastems_stemid) > 0):
@@ -348,7 +359,6 @@ class DocParser(object):
else :
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
-
elif (regtype == 'tocentry') :
ptype = 'full'
# check to see if this is a continution from the previous page
@@ -363,7 +373,7 @@ class DocParser(object):
(pclass, pdesc) = self.getParaDescription(start,end)
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
- elif regtype == 'synth_fcvr.center' :
+ elif (regtype == 'synth_fcvr.center') or (regtype == 'synth_text.center'):
if not anchorSet:
htmlpage += ' 
\n'
anchorSet = True
@@ -373,30 +383,38 @@ class DocParser(object):
else :
print 'Warning: Unknown region type', regtype
- print 'Treating this like a "fixed" region'
- regtype = 'fixed'
- ptype = 'full'
- # check to see if this is a continution from the previous page
- if (len(self.parastems_stemid) > 0):
- ptype = 'end'
- self.parastems_stemid=[]
- else:
+ (pos, temp) = self.findinDoc('paragraph',start,end)
+ if temp:
+ print 'Treating this like a "text" region'
+ regtype = 'fixed'
+ ptype = 'full'
+ # check to see if this is a continuation from the previous page
+ if (len(self.parastems_stemid) > 0):
+ ptype = 'end'
+ self.parastems_stemid=[]
+ else:
+ if not anchorSet:
+ htmlpage += ' 
\n'
+ anchorSet = True
+ (pclass, pdesc) = self.getParaDescription(start,end)
+ if ptype == 'full' :
+ tag = 'p'
+ if pclass[3:6] == 'h1-' : tag = 'h4'
+ if pclass[3:6] == 'h2-' : tag = 'h5'
+ if pclass[3:6] == 'h3-' : tag = 'h6'
+ htmlpage += '<' + tag + ' class="' + pclass + '">'
+ htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
+ htmlpage += '' + tag + '>'
+ else :
+ htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
+ else :
+ print 'Treating this like a "image" region'
if not anchorSet:
htmlpage += ' 
\n'
anchorSet = True
- (pclass, desc) = self.getParaDescription(start,end)
- if ptype == 'full' :
- tag = 'p'
- if pclass[3:6] == 'h1-' : tag = 'h4'
- if pclass[3:6] == 'h2-' : tag = 'h5'
- if pclass[3:6] == 'h3-' : tag = 'h6'
- htmlpage += '<' + tag + ' class="' + pclass + '">'
- htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
- htmlpage += '' + tag + '>'
- else :
- htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
-
-
+ (pos, simgsrc) = self.findinDoc('img.src',start,end)
+ if simgsrc:
+ htmlpage += '' % int(simgsrc)
if len(self.paracont_stemid) > 0 :
if htmlpage[-4:] == '
':
diff --git a/Topaz_Tools/lib/genhtml.py b/Topaz_Tools/lib/genhtml.py
index 05261c9..df39539 100644
--- a/Topaz_Tools/lib/genhtml.py
+++ b/Topaz_Tools/lib/genhtml.py
@@ -8,7 +8,7 @@ import convert2xml
import flatxml2html
import decode_meta
import stylexml2css
-
+import getpagedim
def usage():
print 'Usage: '
@@ -86,6 +86,7 @@ def main(argv):
htmlstr += '\n'
+ # process metadata and retrieve fontSize info
print ' ', 'metadata0000.dat'
fname = os.path.join(bookDir,'metadata0000.dat')
xname = os.path.join(bookDir, 'metadata.txt')
@@ -100,12 +101,27 @@ def main(argv):
if 'fontSize' in meta_array:
fontsize = meta_array['fontSize']
+ # also get the number of the first text page (defaults to page 1 if not in the metadata)
+ spage = '1'
+ if 'firstTextPage' in meta_array:
+ spage = meta_array['firstTextPage']
+ pnum = int(spage)
+
+ # get page height and width from first text page for use in stylesheet scaling
+ pname = 'page%04d.dat' % pnum
+ fname = os.path.join(pageDir,pname)
+ flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
+ (ph, pw) = getpagedim.getPageDim(flat_xml)
+ if (ph == '-1') : ph = 11000
+ if (pw == '-1') : pw = 8500
+
+ # now build up the style sheet
print ' ', 'other0000.dat'
fname = os.path.join(bookDir,'other0000.dat')
xname = os.path.join(bookDir, 'style.css')
xmlstr = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
htmlstr += '\n'
diff --git a/Topaz_Tools/lib/getpagedim.py b/Topaz_Tools/lib/getpagedim.py
new file mode 100644
index 0000000..dd1071c
--- /dev/null
+++ b/Topaz_Tools/lib/getpagedim.py
@@ -0,0 +1,53 @@
+#! /usr/bin/python
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import with_statement
+import csv
+import sys
+import os
+import getopt
+from struct import pack
+from struct import unpack
+
+
+class DocParser(object):
+ def __init__(self, flatxml):
+ self.flatdoc = flatxml.split('\n')
+
+
+ # find the first line in [pos, end) whose tag name ends with tagpath (end == -1 means search to the end of the document)
+ def findinDoc(self, tagpath, pos, end) :
+ result = None
+ docList = self.flatdoc
+ cnt = len(docList)
+ if end == -1 :
+ end = cnt
+ else:
+ end = min(cnt,end)
+ foundat = -1
+ for j in xrange(pos, end):
+ item = docList[j]
+ if item.find('=') >= 0:
+ (name, argres) = item.split('=')
+ else :
+ name = item
+ argres = ''
+ if name.endswith(tagpath) :
+ result = argres
+ foundat = j
+ break
+ return foundat, result
+
+ def process(self):
+ (pos, sph) = self.findinDoc('page.h',0,-1)
+ (pos, spw) = self.findinDoc('page.w',0,-1)
+ if (sph == None): sph = '-1'
+ if (spw == None): spw = '-1'
+ return sph, spw
+
+
+def getPageDim(flatxml):
+ # create a document parser
+ dp = DocParser(flatxml)
+ (ph, pw) = dp.process()
+ return ph, pw
diff --git a/Topaz_Tools/lib/readme.txt b/Topaz_Tools/lib/readme.txt
index afe4a5a..c9fcb61 100644
--- a/Topaz_Tools/lib/readme.txt
+++ b/Topaz_Tools/lib/readme.txt
@@ -3,7 +3,8 @@ Contributors:
clarknova - for all of the svg and glyph generation and many other bug fixes and improvements
skindle - for figuing out the general case for the mode loops
some updates - for conversion to xml, basic html
- DiapDealer - for extensive testing and feeback
+ DiapDealer - for extensive testing and feedback
+ stewball - for extensive testing and feedback
and others for posting, feedback and testing
@@ -23,12 +24,13 @@ decode_meta.py - converts metadata0000.dat to human readable text (for the most
convert2xml.py - converts page*.dat, other*.dat, and glyphs*.dat files to pseudo xml descriptions
flatxml2html.py - converts a "flattened" xml description to html using the ocrtext
stylexml2css.py - converts stylesheet "flattened" xml into css (as best it can)
+getpagedim.py - reads the "flattened" xml of a page*.dat file to get the book height and width parameters
genxml.py - main program to convert everything to xml
genhtml.py - main program to generate "book.html"
gensvg.py - (author: clarknova) main program to create an svg grpahic of each page
Please note, gensvg.py, genhtml.py, and genxml.py import and use
-decode_meta.py, convert2xml.py, flatxml2html.py, and stylexml2css.py
+decode_meta.py, convert2xml.py, flatxml2html.py, getpagedim.py and stylexml2css.py
so please keep all of these python scripts together in the same place.
diff --git a/Topaz_Tools/lib/stylexml2css.py b/Topaz_Tools/lib/stylexml2css.py
index ede6767..0d2739b 100644
--- a/Topaz_Tools/lib/stylexml2css.py
+++ b/Topaz_Tools/lib/stylexml2css.py
@@ -11,9 +11,11 @@ from struct import unpack
class DocParser(object):
- def __init__(self, flatxml, fontsize):
+ def __init__(self, flatxml, fontsize, ph, pw):
self.flatdoc = flatxml.split('\n')
self.fontsize = int(fontsize)
+ self.ph = int(ph) * 1.0
+ self.pw = int(pw) * 1.0
stags = {
'paragraph' : 'p',
@@ -106,14 +108,14 @@ class DocParser(object):
# get the style class
(pos, sclass) = self.findinDoc('style.class',start,end)
if sclass != None:
- sclass = '.cl_' + sclass.lower()
+ sclass = '.cl-' + sclass.lower()
else :
sclass = ''
# check for any "after class" specifiers
(pos, aftclass) = self.findinDoc('style._after_class',start,end)
if aftclass != None:
- aftclass = '.cl_' + aftclass.lower()
+ aftclass = '.cl-' + aftclass.lower()
else :
aftclass = ''
@@ -121,8 +123,8 @@ class DocParser(object):
while True :
- (pos, attr) = self.findinDoc('style.rule.attr', start, end)
- (pos, val) = self.findinDoc('style.rule.value', start, end)
+ (pos1, attr) = self.findinDoc('style.rule.attr', start, end)
+ (pos2, val) = self.findinDoc('style.rule.value', start, end)
if attr == None : break
@@ -135,28 +137,34 @@ class DocParser(object):
# handle value based attributes
if attr in self.attr_val_map :
name = self.attr_val_map[attr]
- scale = self.fontsize
- if attr == 'line-space': scale = scale * 1.41
+ if attr in ('margin-bottom', 'margin-top', 'space-after') :
+ scale = self.ph
+ elif attr in ('margin-right', 'indent', 'margin-left', 'hang') :
+ scale = self.pw
+ elif attr == 'line-space':
+ scale = self.fontsize * 2.0
+
if not ((attr == 'hang') and (int(val) == 0)) :
- ems = int(val)/scale
- cssargs[attr] = (self.attr_val_map[attr], ems)
+ pv = float(val)/scale
+ cssargs[attr] = (self.attr_val_map[attr], pv)
keep = True
- start = pos + 1
+ start = max(pos1, pos2) + 1
# disable all of the after class tags until I figure out how to handle them
if aftclass != "" : keep = False
if keep :
- # make sure line-space does not go below 1em
+ # make sure line-space does not go below 100% or above 300% since
+ # it can be wacky in some styles
if 'line-space' in cssargs:
seg = cssargs['line-space'][0]
val = cssargs['line-space'][1]
if val < 1.0: val = 1.0
+ if val > 3.0: val = 3.0
del cssargs['line-space']
cssargs['line-space'] = (self.attr_val_map['line-space'], val)
-
# handle modifications for css style hanging indents
if 'hang' in cssargs:
@@ -166,11 +174,13 @@ class DocParser(object):
cssargs['hang'] = (self.attr_val_map['hang'], -hval)
mval = 0
mseg = 'margin-left: '
+ mval = hval
if 'margin-left' in cssargs:
mseg = cssargs['margin-left'][0]
mval = cssargs['margin-left'][1]
+ if mval < 0: mval = 0
mval = hval + mval
- cssargs['margin-left'] = (mseg, mval)
+ cssargs['margin-left'] = (mseg, mval)
if 'indent' in cssargs:
del cssargs['indent']
@@ -181,7 +191,7 @@ class DocParser(object):
if mval == '':
cssline += mseg + ' '
else :
- aseg = mseg + '%.1fem;' % mval
+ aseg = mseg + '%.1f%%;' % (mval * 100.0)
cssline += aseg + ' '
cssline += '}'
@@ -213,10 +223,14 @@ class DocParser(object):
-def convert2CSS(flatxml, fontsize):
+def convert2CSS(flatxml, fontsize, ph, pw):
+
+ print ' ', 'Using font size:',fontsize
+ print ' ', 'Using page height:', ph
+ print ' ', 'Using page width:', pw
# create a document parser
- dp = DocParser(flatxml, fontsize)
+ dp = DocParser(flatxml, fontsize, ph, pw)
csspage = dp.process()