tools v1.5

This commit is contained in:
Apprentice Alf 2010-03-02 12:46:56 +00:00
parent 6fb13373cf
commit 8e7d2657a4
12 changed files with 98 additions and 43 deletions

View File

@ -1,5 +1,5 @@
#! /usr/bin/python #! /usr/bin/python
# For use in Topaz Scripts version 2.3 # For use in Topaz Scripts version 2.6
""" """

View File

@ -1,5 +1,5 @@
#!/usr/bin/python #!/usr/bin/python
# For use with Topaz Scripts Version 2.3 # For use with Topaz Scripts Version 2.6
class Unbuffered: class Unbuffered:
def __init__(self, stream): def __init__(self, stream):

View File

@ -1,6 +1,6 @@
#! /usr/bin/python #! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# For use with Topaz Scripts Version 2.4 # For use with Topaz Scripts Version 2.6
class Unbuffered: class Unbuffered:
def __init__(self, stream): def __init__(self, stream):
@ -315,6 +315,12 @@ class PageParser(object):
'version.findlists' : (1, 'scalar_text', 0, 0), 'version.findlists' : (1, 'scalar_text', 0, 0),
'version.page_num' : (1, 'scalar_text', 0, 0), 'version.page_num' : (1, 'scalar_text', 0, 0),
'version.page_type' : (1, 'scalar_text', 0, 0), 'version.page_type' : (1, 'scalar_text', 0, 0),
'version.bad_text' : (1, 'scalar_text', 0, 0),
'version.glyph_mismatch' : (1, 'scalar_text', 0, 0),
'version.margins' : (1, 'scalar_text', 0, 0),
'version.staggered_lines' : (1, 'scalar_text', 0, 0),
'version.paragraph_continuation' : (1, 'scalar_text', 0, 0),
'version.toc' : (1, 'scalar_text', 0, 0),
'stylesheet' : (1, 'snippets', 1, 0), 'stylesheet' : (1, 'snippets', 1, 0),
'style' : (1, 'snippets', 1, 0), 'style' : (1, 'snippets', 1, 0),
@ -662,16 +668,19 @@ class PageParser(object):
def process(self): def process(self):
# peek at the first bytes to see what type of file it is # peek at the first bytes to see what type of file it is
magic = self.fo.read(11) magic = self.fo.read(9)
if (magic[0:1] == 'p') and (magic[2:10] == '__PAGE__'): if (magic[0:1] == 'p') and (magic[2:9] == 'marker_'):
first_token = 'info' first_token = 'info'
elif (magic[0:1] == 'g') and (magic[2:11] == '__GLYPH__'): elif (magic[0:1] == 'p') and (magic[2:9] == '__PAGE_'):
skip = self.fo.read(1) skip = self.fo.read(2)
first_token = 'info'
elif (magic[0:1] == 'g') and (magic[2:9] == '__GLYPH'):
skip = self.fo.read(3)
first_token = 'info' first_token = 'info'
else : else :
# other0.dat file # other0.dat file
first_token = None first_token = None
self.fo.seek(-11,1) self.fo.seek(-9,1)
# main loop to read and build the document tree # main loop to read and build the document tree

View File

@ -1,6 +1,6 @@
#! /usr/bin/python #! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# For use with Topaz Scripts Version 2.3 # For use with Topaz Scripts Version 2.6
import csv import csv
import sys import sys

View File

@ -1,6 +1,6 @@
#! /usr/bin/python #! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# For use with Topaz Scripts Version 2.3 # For use with Topaz Scripts Version 2.6
import sys import sys
import csv import csv
@ -32,6 +32,8 @@ class DocParser(object):
self.link_id = [] self.link_id = []
self.link_title = [] self.link_title = []
self.link_page = [] self.link_page = []
self.link_href = []
self.link_type = []
self.dehyphen_rootid = [] self.dehyphen_rootid = []
self.paracont_stemid = [] self.paracont_stemid = []
self.parastems_stemid = [] self.parastems_stemid = []
@ -197,6 +199,7 @@ class DocParser(object):
# get the class # get the class
def getClass(self, pclass): def getClass(self, pclass):
nclass = pclass nclass = pclass
# class names are an issue given topaz may start them with numerals (not allowed), # class names are an issue given topaz may start them with numerals (not allowed),
# use a mix of cases (which cause some browsers problems), and actually # use a mix of cases (which cause some browsers problems), and actually
# attach numbers after "_reclustered*" to the end to deal with classes that inherit # attach numbers after "_reclustered*" to the end to deal with classes that inherit
@ -206,7 +209,10 @@ class DocParser(object):
# so we clean this up by lowercasing, prepend 'cl-', and getting any baseclass # so we clean this up by lowercasing, prepend 'cl-', and getting any baseclass
# that exists in the stylesheet first, and then adding this specific class # that exists in the stylesheet first, and then adding this specific class
# after # after
# also some class names have spaces in them so need to convert to dashes
if nclass != None : if nclass != None :
nclass = nclass.replace(' ','-')
classres = '' classres = ''
nclass = nclass.lower() nclass = nclass.lower()
nclass = 'cl-' + nclass nclass = 'cl-' + nclass
@ -334,7 +340,7 @@ class DocParser(object):
result.append(('svg', num)) result.append(('svg', num))
return pclass, result return pclass, result
# this type of paragrph may be made up of multiple spans, inline # this type of paragraph may be made up of multiple spans, inline
# word monograms (images), and words with semantic meaning, # word monograms (images), and words with semantic meaning,
# plus glyphs used to form starting letter of first word # plus glyphs used to form starting letter of first word
@ -391,6 +397,9 @@ class DocParser(object):
result.append(('img' + word_class, int(argres))) result.append(('img' + word_class, int(argres)))
word_class = '' word_class = ''
elif name.endswith('region.img.src'):
result.append(('img' + word_class, int(argres)))
if (sp_first != -1) and (sp_last != -1): if (sp_first != -1) and (sp_last != -1):
for wordnum in xrange(sp_first, sp_last): for wordnum in xrange(sp_first, sp_last):
result.append(('ocr', wordnum)) result.append(('ocr', wordnum))
@ -437,6 +446,8 @@ class DocParser(object):
if (type == 'end'): if (type == 'end'):
parares += ' ' parares += ' '
lstart = len(parares)
cnt = len(pdesc) cnt = len(pdesc)
for j in xrange( 0, cnt) : for j in xrange( 0, cnt) :
@ -450,9 +461,14 @@ class DocParser(object):
if handle_links: if handle_links:
link = self.link_id[num] link = self.link_id[num]
if (link > 0): if (link > 0):
linktype = self.link_type[link-1]
title = self.link_title[link-1] title = self.link_title[link-1]
if (title == "") or (parares.rfind(title) < 0): if (title == "") or (parares.rfind(title) < 0):
title='_link_' title=parares[lstart:]
if linktype == 'external' :
linkhref = self.link_href[link-1]
linkhtml = '<a href="%s">' % linkhref
else :
ptarget = self.link_page[link-1] - 1 ptarget = self.link_page[link-1] - 1
linkhtml = '<a href="#page%04d">' % ptarget linkhtml = '<a href="#page%04d">' % ptarget
linkhtml += title + '</a>' linkhtml += title + '</a>'
@ -461,6 +477,7 @@ class DocParser(object):
parares = parares[0:pos] + linkhtml + parares[pos+len(title):] parares = parares[0:pos] + linkhtml + parares[pos+len(title):]
else : else :
parares += linkhtml parares += linkhtml
lstart = len(parares)
if word == '_link_' : word = '' if word == '_link_' : word = ''
elif (link < 0) : elif (link < 0) :
if word == '_link_' : word = '' if word == '_link_' : word = ''
@ -532,6 +549,14 @@ class DocParser(object):
# collect link destination page numbers # collect link destination page numbers
self.link_page = self.getData('info.links.page',0,-1) self.link_page = self.getData('info.links.page',0,-1)
# collect link types (container versus external)
(pos, argres) = self.findinDoc('info.links.type',0,-1)
if argres : self.link_type = argres.split('|')
# collect link destinations
(pos, argres) = self.findinDoc('info.links.href',0,-1)
if argres : self.link_href = argres.split('|')
# collect link titles # collect link titles
(pos, argres) = self.findinDoc('info.links.title',0,-1) (pos, argres) = self.findinDoc('info.links.title',0,-1)
if argres : if argres :
@ -641,16 +666,18 @@ class DocParser(object):
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype) htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
elif (regtype == 'synth_fcvr.center') or (regtype == 'synth_text.center'): elif (regtype == 'synth_fcvr.center'):
(pos, simgsrc) = self.findinDoc('img.src',start,end) (pos, simgsrc) = self.findinDoc('img.src',start,end)
if simgsrc: if simgsrc:
htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc) htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
else : else :
print 'Warning: region type', regtype print ' Making region type', regtype,
(pos, temp) = self.findinDoc('paragraph',start,end) (pos, temp) = self.findinDoc('paragraph',start,end)
if pos != -1: (pos2, temp) = self.findinDoc('span',start,end)
print ' is a "text" region' if pos != -1 or pos2 != -1:
print ' a "text" region'
orig_regtype = regtype
regtype = 'fixed' regtype = 'fixed'
ptype = 'full' ptype = 'full'
# check to see if this is a continuation from the previous page # check to see if this is a continuation from the previous page
@ -658,6 +685,11 @@ class DocParser(object):
ptype = 'end' ptype = 'end'
first_para_continued = False first_para_continued = False
(pclass, pdesc) = self.getParaDescription(start,end, regtype) (pclass, pdesc) = self.getParaDescription(start,end, regtype)
if not pclass:
if orig_regtype.endswith('.right') : pclass = 'cl-right'
elif orig_regtype.endswith('.center') : pclass = 'cl-center'
elif orig_regtype.endswith('.left') : pclass = 'cl-left'
elif orig_regtype.endswith('.justify') : pclass = 'cl-justify'
if pclass and (ptype == 'full') and (len(pclass) >= 6): if pclass and (ptype == 'full') and (len(pclass) >= 6):
tag = 'p' tag = 'p'
if pclass[3:6] == 'h1-' : tag = 'h4' if pclass[3:6] == 'h1-' : tag = 'h4'
@ -669,7 +701,7 @@ class DocParser(object):
else : else :
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype) htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
else : else :
print ' is a "graphic" region' print ' a "graphic" region'
(pos, simgsrc) = self.findinDoc('img.src',start,end) (pos, simgsrc) = self.findinDoc('img.src',start,end)
if simgsrc: if simgsrc:
htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc) htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)

View File

@ -1,6 +1,6 @@
#! /usr/bin/python #! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# For use with Topaz Scripts Version 2.3 # For use with Topaz Scripts Version 2.6
class Unbuffered: class Unbuffered:
def __init__(self, stream): def __init__(self, stream):

View File

@ -1,6 +1,6 @@
#! /usr/bin/python #! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# For use with Topaz Scripts Version 2.3 # For use with Topaz Scripts Version 2.6
class Unbuffered: class Unbuffered:
def __init__(self, stream): def __init__(self, stream):

View File

@ -1,6 +1,6 @@
#! /usr/bin/python #! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# For use with Topaz Scripts Version 2.3 # For use with Topaz Scripts Version 2.6
class Unbuffered: class Unbuffered:
def __init__(self, stream): def __init__(self, stream):

View File

@ -1,6 +1,6 @@
#! /usr/bin/python #! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# For use with Topaz Scripts Version 2.3 # For use with Topaz Scripts Version 2.6
import csv import csv
import sys import sys

View File

@ -1,6 +1,6 @@
#! /usr/bin/python #! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# For use with Topaz Scripts Version 2.3 # For use with Topaz Scripts Version 2.6
import csv import csv
import sys import sys
@ -85,7 +85,10 @@ class DocParser(object):
def process(self): def process(self):
classlst = '' classlst = ''
csspage = '' csspage = '.cl-center { text-align: center; margin-left: auto; margin-right: auto; }\n'
csspage += '.cl-right { text-align: right; }\n'
csspage += '.cl-left { text-align: left; }\n'
csspage += '.cl-justify { text-align: justify; }\n'
# generate a list of each <style> starting point in the stylesheet # generate a list of each <style> starting point in the stylesheet
styleList= self.posinDoc('book.stylesheet.style') styleList= self.posinDoc('book.stylesheet.style')
@ -108,6 +111,7 @@ class DocParser(object):
# get the style class # get the style class
(pos, sclass) = self.findinDoc('style.class',start,end) (pos, sclass) = self.findinDoc('style.class',start,end)
if sclass != None: if sclass != None:
sclass = sclass.replace(' ','-')
sclass = '.cl-' + sclass.lower() sclass = '.cl-' + sclass.lower()
else : else :
sclass = '' sclass = ''
@ -115,6 +119,7 @@ class DocParser(object):
# check for any "after class" specifiers # check for any "after class" specifiers
(pos, aftclass) = self.findinDoc('style._after_class',start,end) (pos, aftclass) = self.findinDoc('style._after_class',start,end)
if aftclass != None: if aftclass != None:
aftclass = aftclass.replace(' ','-')
aftclass = '.cl-' + aftclass.lower() aftclass = '.cl-' + aftclass.lower()
else : else :
aftclass = '' aftclass = ''
@ -216,6 +221,7 @@ class DocParser(object):
if ctype == 'h3_' : if ctype == 'h3_' :
csspage += 'h6' + cssline + '\n' csspage += 'h6' + cssline + '\n'
if cssline != ' { }':
csspage += self.stags[tag] + cssline + '\n' csspage += self.stags[tag] + cssline + '\n'

View File

@ -1,4 +1,14 @@
Canges in 2.3 Changes in 2.6
- fix for many additional version tags
- fixes to generate better links
- fixes to handle external links
- now handles new "marker" page .dat files
- improved special region handling
- properly handle class names with spaces
- handle default alignment for synthetic regions
Changes in 2.3
- fix for use with non-latin1 based systems (thank you Tedd) - fix for use with non-latin1 based systems (thank you Tedd)
- fixes for out of order tokens in xml - fixes for out of order tokens in xml

View File

@ -53,8 +53,9 @@
# 0.12 - Fix added to prevent lowercasing of image names when the pml code itself uses a different case in the link name. # 0.12 - Fix added to prevent lowercasing of image names when the pml code itself uses a different case in the link name.
# 0.13 - change to unbuffered stdout for use with gui front ends # 0.13 - change to unbuffered stdout for use with gui front ends
# 0.14 - contributed enhancement to support --make-pmlz switch # 0.14 - contributed enhancement to support --make-pmlz switch
# 0.15 - enabled high-ascii to pml character encoding. DropBook now works on Mac.
__version__='0.14' __version__='0.15'
# Import Psyco if available # Import Psyco if available
try: try:
@ -465,17 +466,6 @@ class EreaderProcessor(object):
data = sect[62:] data = sect[62:]
return sanitizeFileName(name), data return sanitizeFileName(name), data
def cleanPML(self,pml):
# Update old \b font tag with correct \B bold font tag
pml2 = pml.replace('\\b', '\\B')
# Convert special characters to proper PML code. High ASCII start at (\x82, \a130) and go up to (\xff, \a255)
for k in xrange(130,256):
# a2b_hex takes in a hexadecimal as a string and converts it
# to a binary ascii code that we search and replace for
badChar=binascii.a2b_hex('%02x' % k)
pml2 = pml2.replace(badChar, '\\a%03d' % k)
#end for k
return pml2
# def getChapterNamePMLOffsetData(self): # def getChapterNamePMLOffsetData(self):
# cv = '' # cv = ''
@ -564,6 +554,14 @@ class EreaderProcessor(object):
return r return r
def cleanPML(pml):
    """Return *pml* with high-ASCII characters rewritten as PML escapes.

    Every character whose code is in the range 0x80-0xff (128-255) is
    replaced by a backslash, the letter ``a`` and the three-digit decimal
    character code (for example ``\\xe9`` becomes ``\\a233``).  All other
    characters are passed through unchanged.
    """
    # Local import keeps this function self-contained; the module's
    # import block is not guaranteed to provide ``re``.
    import re

    # One pass over the text instead of 128 separate str.replace() scans.
    # The replacement text is pure ASCII, so it can never be re-matched.
    return re.sub(r'[\x80-\xff]',
                  lambda match: '\\a%03d' % ord(match.group(0)),
                  pml)
def convertEreaderToPml(infile, name, cc, outdir): def convertEreaderToPml(infile, name, cc, outdir):
if not os.path.exists(outdir): if not os.path.exists(outdir):
os.makedirs(outdir) os.makedirs(outdir)
@ -585,7 +583,7 @@ def convertEreaderToPml(infile, name, cc, outdir):
print " Extracting pml" print " Extracting pml"
pml_string = er.getText() pml_string = er.getText()
pmlfilename = bookname + ".pml" pmlfilename = bookname + ".pml"
file(os.path.join(outdir, pmlfilename),'wb').write(pml_string) file(os.path.join(outdir, pmlfilename),'wb').write(cleanPML(pml_string))
# bkinfo = er.getBookInfo() # bkinfo = er.getBookInfo()
# if bkinfo != '': # if bkinfo != '':
@ -677,7 +675,7 @@ def main(argv=None):
search_time = end_time - start_time search_time = end_time - start_time
print 'elapsed time: %.2f seconds' % (search_time, ) print 'elapsed time: %.2f seconds' % (search_time, )
if make_pmlz : if make_pmlz :
print 'output in %s' % zipname print 'output is %s' % zipname
else : else :
print 'output in %s' % outdir print 'output in %s' % outdir
print "done" print "done"