topazscripts 1.3 by some_updates

This commit is contained in:
some_updates 2010-01-19 12:11:59 +00:00 committed by Apprentice Alf
parent 0a437510f6
commit c1e5943471
6 changed files with 651 additions and 554 deletions

View File

@ -160,101 +160,159 @@ class PageParser(object):
# tag : (number of arguments, argument type, subtags present, special case of subtags presents when escaped) # tag : (number of arguments, argument type, subtags present, special case of subtags presents when escaped)
token_tags = { token_tags = {
'book' : (1, 'snippets', 1, 0), 'x' : (1, 'scalar_number', 0, 0),
'version' : (1, 'snippets', 1, 0), 'y' : (1, 'scalar_number', 0, 0),
'stylesheet' : (1, 'snippets', 1, 0),
'links' : (0, 'number', 0, 1),
'pages' : (0, 'number', 0, 1),
'page' : (1, 'snippets', 1, 0),
'group' : (1, 'snippets', 1, 0),
'region' : (1, 'snippets', 1, 0),
'reflow' : (1, 'number', 1, 0),
'img' : (1, 'snippets', 1, 0),
'paragraph' : (1, 'snippets', 1, 0),
'extratokens' : (1, 'snippets', 1, 0),
'style' : (1, 'snippets', 1, 0),
'rule' : (1, 'snippets', 1, 0),
'_span' : (1, 'snippets', 1, 0),
'word_semantic': (1, 'snippets', 1, 1),
'value' : (1, 'scalar_text', 0, 0),
'h' : (1, 'scalar_number', 0, 0), 'h' : (1, 'scalar_number', 0, 0),
'w' : (1, 'scalar_number', 0, 0), 'w' : (1, 'scalar_number', 0, 0),
'firstWord' : (1, 'scalar_number', 0, 0), 'firstWord' : (1, 'scalar_number', 0, 0),
'lastWord' : (1, 'scalar_number', 0, 0), 'lastWord' : (1, 'scalar_number', 0, 0),
'x' : (1, 'number', 0, 0), 'rootID' : (1, 'scalar_number', 0, 0),
'y' : (1, 'number', 0, 0), 'stemID' : (1, 'scalar_number', 0, 0),
'type' : (1, 'scalar_text', 0, 0),
'info' : (0, 'number', 1, 0),
'info.word' : (0, 'number', 1, 1),
'info.word.ocrText' : (1, 'text', 0, 0),
'info.word.firstGlyph' : (1, 'raw', 0, 0),
'info.word.lastGlyph' : (1, 'raw', 0, 0),
'info.word.bl' : (1, 'raw', 0, 0),
'info.word.link_id' : (1, 'number', 0, 0),
'glyph' : (0, 'number', 1, 1),
'glyph.x' : (1, 'number', 0, 0),
'glyph.y' : (1, 'number', 0, 0),
'glyph.glyphID' : (1, 'number', 0, 0),
'dehyphen' : (0, 'number', 1, 1),
'dehyphen.rootID' : (1, 'number', 0, 0),
'dehyphen.stemID' : (1, 'number', 0, 0),
'dehyphen.stemPage' : (1, 'number', 0, 0),
'dehyphen.sh' : (1, 'number', 0, 0),
'links' : (0, 'number', 1, 1),
'links.page' : (1, 'number', 0, 0), 'links.page' : (1, 'number', 0, 0),
'link_id' : (1, 'number', 0, 0), 'links.rel' : (1, 'number', 0, 0),
'glyph' : (0, 'number', 1, 1), 'links.row' : (1, 'number', 0, 0),
'links.title' : (1, 'text', 0, 0),
'links.href' : (1, 'text', 0, 0),
'links.type' : (1, 'text', 0, 0),
'paraCont' : (0, 'number', 1, 1),
'paraCont.rootID' : (1, 'number', 0, 0),
'paraCont.stemID' : (1, 'number', 0, 0),
'paraCont.stemPage' : (1, 'number', 0, 0),
'paraStems' : (0, 'number', 1, 1),
'paraStems.stemID' : (1, 'number', 0, 0),
'wordStems' : (0, 'number', 1, 1),
'wordStems.stemID' : (1, 'number', 0, 0),
'page' : (1, 'snippets', 1, 0),
'page.pageid' : (1, 'scalar_text', 0, 0),
'page.pagelabel' : (1, 'scalar_text', 0, 0),
'page.type' : (1, 'scalar_text', 0, 0),
'page.h' : (1, 'scalar_number', 0, 0),
'page.w' : (1, 'scalar_number', 0, 0),
'page.startID' : (1, 'scalar_number', 0, 0),
'group' : (1, 'snippets', 1, 0),
'group.type' : (1, 'scalar_text', 0, 0),
'region' : (1, 'snippets', 1, 0),
'region.type' : (1, 'scalar_text', 0, 0),
'region.x' : (1, 'scalar_number', 0, 0),
'region.y' : (1, 'scalar_number', 0, 0),
'region.h' : (1, 'scalar_number', 0, 0),
'region.w' : (1, 'scalar_number', 0, 0),
'img' : (1, 'snippets', 1, 0),
'img.x' : (1, 'scalar_number', 0, 0),
'img.y' : (1, 'scalar_number', 0, 0),
'img.h' : (1, 'scalar_number', 0, 0),
'img.w' : (1, 'scalar_number', 0, 0),
'img.src' : (1, 'scalar_number', 0, 0),
'paragraph' : (1, 'snippets', 1, 0),
'paragraph.class' : (1, 'scalar_text', 0, 0),
'paragraph.firstWord' : (1, 'scalar_number', 0, 0),
'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
'word_semantic' : (1, 'snippets', 1, 1),
'word_semantic.type' : (1, 'scalar_text', 0, 0),
'word_semantic.firstWord' : (1, 'scalar_number', 0, 0),
'word_semantic.lastWord' : (1, 'scalar_number', 0, 0),
'word' : (1, 'snippets', 1, 0),
'word.type' : (1, 'scalar_text', 0, 0),
'word.class' : (1, 'scalar_text', 0, 0),
'_span' : (1, 'snippets', 1, 0),
'_span.firstWord' : (1, 'scalar_number', 0, 0),
'-span.lastWord' : (1, 'scalar_number', 0, 0),
'extratokens' : (1, 'snippets', 1, 0),
'extratokens.type' : (1, 'scalar_text', 0, 0),
'extratokens.firstGlyph' : (1, 'scalar_number', 0, 0),
'extratokens.lastGlyph' : (1, 'scalar_number', 0, 0),
'glyph.h' : (1, 'number', 0, 0), 'glyph.h' : (1, 'number', 0, 0),
'glyph.w' : (1, 'number', 0, 0), 'glyph.w' : (1, 'number', 0, 0),
'sh' : (1, 'number', 0, 0), 'glyph.use' : (1, 'number', 0, 0),
'word' : (0, 'number', 1, 1), 'glyph.vtx' : (1, 'number', 0, 1),
'src' : (1, 'scalar_number', 0, 0), 'glyph.len' : (1, 'number', 0, 1),
'rel' : (1, 'number', 0, 0), 'glyph.dpi' : (1, 'number', 0, 0),
'row' : (1, 'number', 0, 0), 'vtx' : (0, 'number', 1, 1),
'startID' : (1, 'number', 0, 1), 'vtx.x' : (1, 'number', 0, 0),
'vtx.y' : (1, 'number', 0, 0),
'len' : (0, 'number', 1, 1),
'len.n' : (1, 'number', 0, 0),
'book' : (1, 'snippets', 1, 0),
'version' : (1, 'snippets', 1, 0),
'version.FlowEdit_1_id' : (1, 'scalar_text', 0, 0),
'version.FlowEdit_1_version' : (1, 'scalar_text', 0, 0),
'version.Schema_id' : (1, 'scalar_text', 0, 0),
'version.Schema_version' : (1, 'scalar_text', 0, 0),
'version.Topaz_version' : (1, 'scalar_text', 0, 0),
'version.WordDetailEdit_1_id' : (1, 'scalar_text', 0, 0),
'version.WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0),
'version.ZoneEdit_1_id' : (1, 'scalar_text', 0, 0),
'version.ZoneEdit_1_version' : (1, 'scalar_text', 0, 0),
'version.chapterheaders' : (1, 'scalar_text', 0, 0),
'version.creation_date' : (1, 'scalar_text', 0, 0),
'version.header_footer' : (1, 'scalar_text', 0, 0),
'version.init_from_ocr' : (1, 'scalar_text', 0, 0),
'version.letter_insertion' : (1, 'scalar_text', 0, 0),
'version.xmlinj_convert' : (1, 'scalar_text', 0, 0),
'version.xmlinj_reflow' : (1, 'scalar_text', 0, 0),
'version.xmlinj_transform' : (1, 'scalar_text', 0, 0),
'version.findlists' : (1, 'scalar_text', 0, 0),
'version.page_num' : (1, 'scalar_text', 0, 0),
'version.page_type' : (1, 'scalar_text', 0, 0),
'stylesheet' : (1, 'snippets', 1, 0),
'style' : (1, 'snippets', 1, 0),
'style._tag' : (1, 'scalar_text', 0, 0),
'style.type' : (1, 'scalar_text', 0, 0),
'style._parent_type' : (1, 'scalar_text', 0, 0),
'style.class' : (1, 'scalar_text', 0, 0),
'style._after_class' : (1, 'scalar_text', 0, 0),
'rule' : (1, 'snippets', 1, 0),
'rule.attr' : (1, 'scalar_text', 0, 0),
'rule.value' : (1, 'scalar_text', 0, 0),
'original' : (0, 'number', 1, 1),
'original.pnum' : (1, 'number', 0, 0),
'original.pid' : (1, 'text', 0, 0),
'pages' : (0, 'number', 1, 1),
'pages.ref' : (1, 'number', 0, 0),
'pages.id' : (1, 'number', 0, 0),
'startID' : (0, 'number', 1, 1),
'startID.page' : (1, 'number', 0, 0), 'startID.page' : (1, 'number', 0, 0),
'glyphID' : (1, 'number', 0, 0), 'startID.id' : (1, 'number', 0, 0),
'rootID' : (1, 'number', 0, 0),
'stemID' : (1, 'number', 0, 0),
'margin-top' : (1, 'number', 0, 0),
'stemPage' : (1, 'number', 0, 0),
'dehyphen' : (1, 'number', 1, 1),
'rootID' : (1, 'number', 0, 0),
'paraCont' : (1, 'number', 1, 1),
'paraStems' : (1, 'number', 1, 1),
'wordStems' : (1, 'number', 1, 1),
'original' : (0, 'number', 0, 1),
'use' : (1, 'number', 0, 0),
'vtx' : (1, 'number', 0, 1),
'len' : (1, 'number', 0, 1),
'dpi' : (1, 'number', 0, 0),
'n' : (1, 'number', 0, 0),
'id' : (1, 'number', 0, 0),
'ref' : (1, 'number', 0, 0),
'pnum' : (1, 'number', 0, 0),
'pid' : (1, 'text', 0, 0),
'info' : (0, 'number', 1, 0),
'bl' : (1, 'raw', 0, 0),
'firstGlyph' : (1, 'raw', 0, 0),
'lastGlyph' : (1, 'raw', 0, 0),
'ocrText' : (1, 'text', 0, 0),
'title' : (1, 'text', 0, 0),
'href' : (1, 'text', 0, 0),
'_parent_type' : (1, 'text', 0, 0),
'attr' : (1, 'scalar_text', 0, 0),
'justify' : (1, 'scalar_text', 0, 0),
'align' : (1, 'scalar_text', 0, 0),
'layout' : (1, 'scalar_text', 0, 0),
'pageid' : (1, 'scalar_text', 0, 0),
'pagelabel' : (1, 'scalar_text', 0, 0),
'type' : (1, 'text', 0, 0),
'class' : (1, 'scalar_text', 0, 0),
'container' : (1, 'scalar_text', 0, 0),
'_after_class' : (1, 'scalar_text', 0, 0),
'_tag' : (1, 'scalar_text', 0, 0),
'pos' : (1, 'scalar_text', 0, 0),
'page_num' : (1, 'scalar_text', 0, 0),
'page_type' : (1, 'scalar_text', 0, 0),
'findlists' : (1, 'scalar_text', 0, 0),
'FlowEdit_1_id' : (1, 'scalar_text', 0, 0),
'FlowEdit_1_version' : (1, 'scalar_text', 0, 0),
'Schema_id' : (1, 'scalar_text', 0, 0),
'Schema_version' : (1, 'scalar_text', 0, 0),
'Topaz_version' : (1, 'scalar_text', 0, 0),
'WordDetailEdit_1_id' : (1, 'scalar_text', 0, 0),
'WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0),
'ZoneEdit_1_id' : (1, 'scalar_text', 0, 0),
'ZoneEdit_1_version' : (1, 'scalar_text', 0, 0),
'chapterheaders' : (1, 'scalar_text', 0, 0),
'creation_date' : (1, 'scalar_text', 0, 0),
'header_footer' : (1, 'scalar_text', 0, 0),
'init_from_ocr' : (1, 'scalar_text', 0, 0),
'letter_insertion' : (1, 'scalar_text', 0, 0),
'xmlinj_convert' : (1, 'scalar_text', 0, 0),
'xmlinj_reflow' : (1, 'scalar_text', 0, 0),
'xmlinj_transform' : (1, 'scalar_text', 0, 0),
} }
@ -404,101 +462,25 @@ class PageParser(object):
return return
# loop: pass though values unchanged
# DO NOT CHANGE - this has proven to be correct # general loop code gracisouly submitted by "skindle" - thank you!
def doLoop76Mode0(self, argtype, cnt): def doLoop76Mode(self, argtype, cnt, mode):
result = [] result = []
adj = 0
if mode & 1:
adj = readEncodedNumber(self.fo)
mode = mode >> 1
x = []
for i in xrange(cnt): for i in xrange(cnt):
result.append(self.formatArg(readEncodedNumber(self.fo), argtype)) x.append(readEncodedNumber(self.fo) - adj)
return result for i in xrange(mode):
for j in xrange(1, cnt):
x[j] = x[j] + x[j - 1]
# loop generating values relative to the *negative*
# of the offset - don't ask why - it just is
# DO NOT CHANGE - this has proven to be correct
def doLoop76Mode1(self, argtype, cnt):
result = []
offset = -readEncodedNumber(self.fo)
for i in xrange(cnt): for i in xrange(cnt):
val = readEncodedNumber(self.fo) + offset result.append(self.formatArg(x[i],argtype))
result.append(self.formatArg(val, argtype))
return result return result
# loop generating values with starting value and accumulation
# DO NOT CHANGE - this has proven to be the correct
def doLoop76Mode2(self, argtype, cnt):
result = []
ptr = readEncodedNumber(self.fo)
result.append(self.formatArg(ptr, argtype))
for i in xrange(cnt-1):
ptr = ptr + readEncodedNumber(self.fo)
result.append(self.formatArg(ptr, argtype))
return result
# loop generating values with starting value and accumulation
# **after** subtracting adjustment value from each
# DO NOT CHANGE - this has been proven to be correct
def doLoop76Mode3(self, argtype, cnt):
result = []
adj = readEncodedNumber(self.fo)
ptr = readEncodedNumber(self.fo)
ptr = ptr - adj
result.append(self.formatArg(ptr, argtype))
for i in xrange(cnt-1):
ptr = ptr + readEncodedNumber(self.fo) - adj
result.append(self.formatArg(ptr,argtype))
return result
# loop using runing sum of data values and starting value
# with accumulation to get new value
# Again, don't ask it took me forever to figure this out
# DO NOT CHANGE - this has been proven to be correct
def doLoop76Mode4(self, argtype, cnt):
result = []
val = readEncodedNumber(self.fo)
runsum = val
ptr = val
result.append(self.formatArg(ptr, argtype))
for i in xrange(cnt-1):
runsum += readEncodedNumber(self.fo)
ptr = ptr + runsum
result.append(self.formatArg(ptr,argtype))
return result
# loop using and extra value as an adjustment
# and a running sum of the values after subtracting
# the adjustment, added to a ptr to get a new pointer
def doLoop76Mode5(self, argtype, cnt):
result = []
adj = readEncodedNumber(self.fo)
ptr = 0
runsum = 0
for i in xrange(cnt):
val = readEncodedNumber(self.fo)
runsum += (val - adj)
ptr = ptr +runsum
result.append(self.formatArg(ptr,argtype))
return result
# FIXME: I have only 4 points to work this out with inside my book
# So may be wrong but it is correct for my 4 points
def doLoop76Mode6(self, argtype, cnt):
result = []
oldval = 0
for i in xrange(cnt):
val = readEncodedNumber(self.fo)
ptr= (3 * oldval) + val + 1
result.append(self.formatArg(ptr,argtype))
oldval = val
return result
# dispatches loop commands bytes with various modes # dispatches loop commands bytes with various modes
# The 0x76 style loops are used to build vectors # The 0x76 style loops are used to build vectors
@ -507,58 +489,21 @@ class PageParser(object):
# since they did not appear in the test cases # since they did not appear in the test cases
def decodeCMD(self, cmd, argtype): def decodeCMD(self, cmd, argtype):
# if (cmd == 0x72):
# self.doLoop72(argtype)
# result =[]
# return result
if (cmd == 0x76): if (cmd == 0x76):
# loop with cnt, and mode to control loop styles # loop with cnt, and mode to control loop styles
cnt = readEncodedNumber(self.fo) cnt = readEncodedNumber(self.fo)
mode = readEncodedNumber(self.fo) mode = readEncodedNumber(self.fo)
if self.debug : print 'Loop for', cnt, 'with mode', mode, ': ' if self.debug : print 'Loop for', cnt, 'with mode', mode, ': '
return self.doLoop76Mode(argtype, cnt, mode)
if (mode == 0x00):
return self.doLoop76Mode0(argtype, cnt)
elif (mode == 0x01):
return self.doLoop76Mode1(argtype, cnt)
elif (mode == 0x02):
return self.doLoop76Mode2(argtype, cnt)
elif (mode == 0x03):
return self.doLoop76Mode3(argtype, cnt)
elif (mode == 0x04):
return self.doLoop76Mode4(argtype, cnt)
elif (mode == 0x05):
return self.doLoop76Mode5(argtype, cnt)
elif (mode == 0x06):
return self.doLoop76Mode6(argtype, cnt)
else:
if self.debug :
# try to mark any unknown loop comands
# if they exist, unless they are used to process
# text or some other known list, we won't be able to prove them correct
print '*** Unknown Loop 0x%x %d %d :' % (cmd, cnt, mode)
for i in xrange(cnt):
val = readEncodedNumber(self.fo)
print ' 0x%x' % val,
print ' '
result = []
return result
if self.dbug: print "Unknown command", cmd if self.dbug: print "Unknown command", cmd
result = [] result = []
return result return result
# add full tag path to injected snippets # add full tag path to injected snippets
def updateName(self, tag, prefix): def updateName(self, tag, prefix):
name = tag[0] name = tag[0]
@ -727,7 +672,7 @@ class PageParser(object):
self.doc.append(tag) self.doc.append(tag)
else: else:
if self.debug: if self.debug:
print "Mina Loop: Unknown value: %x" % v print "Main Loop: Unknown value: %x" % v
# now do snippet injection # now do snippet injection

View File

@ -11,9 +11,16 @@ from struct import unpack
class DocParser(object): class DocParser(object):
def __init__(self, flatxml, fileid): def __init__(self, flatxml, classlst, fileid):
self.id = os.path.basename(fileid).replace('.dat','') self.id = os.path.basename(fileid).replace('.dat','')
self.flatdoc = flatxml.split('\n') self.flatdoc = flatxml.split('\n')
self.classList = {}
tmpList = classlst.split('\n')
for pclass in tmpList:
if pclass != '':
# remove the leading period from the css name
cname = pclass[1:]
self.classList[cname] = True
self.ocrtext = [] self.ocrtext = []
self.link_id = [] self.link_id = []
self.link_title = [] self.link_title = []
@ -22,6 +29,18 @@ class DocParser(object):
self.paracont_stemid = [] self.paracont_stemid = []
self.parastems_stemid = [] self.parastems_stemid = []
# find tag if within pos to end inclusive
def lineinDoc(self, pos) :
docList = self.flatdoc
cnt = len(docList)
if (pos >= 0) and (pos < cnt) :
item = docList[pos]
if item.find('=') >= 0:
(name, argres) = item.split('=',1)
else :
name = item
argres = ''
return name, argres
# find tag if within pos to end inclusive # find tag if within pos to end inclusive
@ -61,91 +80,161 @@ class DocParser(object):
return startpos return startpos
# get a description of the paragraph # build a description of the paragraph
def getParaDescription(self, start, end): def getParaDescription(self, start, end):
result = []
# normal paragraph # normal paragraph
(pos, pclass) = self.findinDoc('paragraph.class',start,end) (pos, pclass) = self.findinDoc('paragraph.class',start,end)
# class names are an issue given topaz starts them with numerals (not allowed) # class names are an issue given topaz may start them with numerals (not allowed),
# use a mix of cases, (which cause some browsers problems), and actually # use a mix of cases (which cause some browsers problems), and actually
# attach numbers after "reclustered*" to the end to deal with reflow issues # attach numbers after "_reclustered*" to the end to deal with reflow issues
# so we clean this up by lowercasing, prepend 'cl_', and remove all end pieces after reclustered # but then not actually provide all of these _reclustereed classes in the stylesheet!
# so we clean this up by lowercasing, prepend 'cl_', and if not in the class
# list from the stylesheet, trying once more with "_reclustered*" removed
# if still not in stylesheet, let it pass as is
pclass = pclass.lower() pclass = pclass.lower()
pclass = 'cl_' + pclass pclass = 'cl_' + pclass
p = pclass.find('reclustered') if pclass not in self.classList:
if p > 0 : pclass = pclass[0:p+11] p = pclass.find('_reclustered')
if p > 0 :
baseclass = pclass[0:p]
if baseclass in self.classList:
pclass = baseclass
# build up a description of the paragraph in result and return it
# first check for the basic - all words paragraph
(pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end) (pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
(pos, slast) = self.findinDoc('paragraph.lastWord',start,end) (pos, slast) = self.findinDoc('paragraph.lastWord',start,end)
if (sfirst != None) and (slast != None) : if (sfirst != None) and (slast != None) :
return pclass, int(sfirst), int(slast) first = int(sfirst)
last = int(slast)
for wordnum in xrange(first, last):
result.append(('ocr', wordnum))
return pclass, result
# some paragraphs are instead split into multiple spans and some even have word_semantic tags as well # this type of paragrph may be made up of multiple _spans, inline
# so walk through this region keeping track of the first firstword, and the last lastWord # word monograms (images) and words with semantic meaning
# on any items that have it
(pos, sfirst) = self.findinDoc('firstWord',start, end) # need to parse this type line by line
first = int(sfirst) line = start + 1
last = -1 word_class = ''
for i in xrange(pos+1,end):
(pos, slast) = self.findinDoc('lastWord',i,i+1) while (line < end) :
if slast != None:
last = int(slast) (name, argres) = self.lineinDoc(line)
return pclass, first, last
if name.endswith('_span.firstWord') :
first = int(argres)
(name, argres) = self.lineinDoc(line+1)
if not name.endswith('_span.lastWord'):
print 'Error: - incorrect _span ordering inside paragraph'
last = int(argres)
for wordnum in xrange(first, last):
result.append(('ocr', wordnum))
line += 1
elif name.endswith('word.class'):
(cname, space) = argres.split('-',1)
if cname == 'spaceafter':
word_class = 'sa'
elif name.endswith('word.img.src'):
result.append(('img' + word_class, int(argres)))
word_class = ''
elif name.endswith('word_semantic.firstWord'):
first = int(argres)
(name, argres) = self.lineinDoc(line+1)
if not name.endswith('word_semantic.lastWord'):
print 'Error: - incorrect word_semantic ordering inside paragraph'
last = int(argres)
for wordnum in xrange(first, last):
result.append(('ocr', wordnum))
line += 1
line += 1
return pclass, result
def buildParagraph(self, cname, first, last, type, regtype) : def buildParagraph(self, cname, pdesc, type, regtype) :
parares = '' parares = ''
sep ='' sep =''
br_lb = False br_lb = False
if (regtype == 'fixed') or (regtype == 'chapterheading') : if (regtype == 'fixed') or (regtype == 'chapterheading') :
br_lb = True br_lb = True
handle_links = False handle_links = False
if len(self.link_id) > 0: if len(self.link_id) > 0:
handle_links = True handle_links = True
if (type == 'full') or (type == 'begin') : if (type == 'full') or (type == 'begin') :
parares += '<p class="' + cname + '">' parares += '<p class="' + cname + '">'
if (type == 'end'): if (type == 'end'):
parares += ' ' parares += ' '
for j in xrange(first, last) :
word = self.ocrtext[j]
sep = ' '
if handle_links: cnt = len(pdesc)
link = self.link_id[j]
if (link > 0): for j in xrange( 0, cnt) :
title = self.link_title[link-1]
if title == "": title='_link_' (wtype, num) = pdesc[j]
ptarget = self.link_page[link-1] - 1
linkhtml = '<a href="#page%04d">' % ptarget if wtype == 'ocr' :
linkhtml += title + '</a>' word = self.ocrtext[num]
pos = parares.rfind(title) sep = ' '
if pos >= 0:
parares = parares[0:pos] + linkhtml + parares[pos+len(title):] if handle_links:
link = self.link_id[num]
if (link > 0):
title = self.link_title[link-1]
if title == "": title='_link_'
ptarget = self.link_page[link-1] - 1
linkhtml = '<a href="#page%04d">' % ptarget
linkhtml += title + '</a>'
pos = parares.rfind(title)
if pos >= 0:
parares = parares[0:pos] + linkhtml + parares[pos+len(title):]
else :
parares += linkhtml
if word == '_link_' : word = ''
elif (link < 0) :
if word == '_link_' : word = ''
if word == '_lb_':
if (num-1) in self.dehyphen_rootid :
word = ''
sep = ''
elif handle_links :
word = ''
sep = ''
elif br_lb :
word = '<br />\n'
sep = ''
else : else :
parares += linkhtml word = '\n'
if word == '_link_' : word = '' sep = ''
elif (link < 0) :
if word == '_link_' : word = ''
if word == '_lb_': if num in self.dehyphen_rootid :
if (j-1) in self.dehyphen_rootid : word = word[0:-1]
word = ''
sep = ''
elif handle_links :
word = ''
sep = ''
elif br_lb :
word = '<br />\n'
sep = ''
else :
word = '\n'
sep = '' sep = ''
if j in self.dehyphen_rootid : parares += word + sep
word = word[0:-1]
elif wtype == 'img' :
sep = '' sep = ''
parares += '<img src="img/img%04d.jpg" alt="" />' % num
parares += sep
parares += word + sep elif wtype == 'imgsa' :
sep = ' '
parares += '<img src="img/img%04d.jpg" alt="" />' % num
parares += sep
if len(sep) > 0 : parares = parares[0:-1] if len(sep) > 0 : parares = parares[0:-1]
if (type == 'full') or (type == 'end') : if (type == 'full') or (type == 'end') :
@ -222,7 +311,7 @@ class DocParser(object):
htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc) htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
elif regtype == 'chapterheading' : elif regtype == 'chapterheading' :
(pclass, first, last) = self.getParaDescription(start,end) (pclass, pdesc) = self.getParaDescription(start,end)
if not breakSet: if not breakSet:
htmlpage += '<div style="page-break-after: always;">&nbsp;</div>\n' htmlpage += '<div style="page-break-after: always;">&nbsp;</div>\n'
breakSet = True breakSet = True
@ -234,7 +323,7 @@ class DocParser(object):
if pclass[3:7] == 'ch2-' : tag = 'h2' if pclass[3:7] == 'ch2-' : tag = 'h2'
if pclass[3:7] == 'ch3-' : tag = 'h3' if pclass[3:7] == 'ch3-' : tag = 'h3'
htmlpage += '<' + tag + ' class="' + pclass + '">' htmlpage += '<' + tag + ' class="' + pclass + '">'
htmlpage += self.buildParagraph(pclass,first,last,'middle', regtype) htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
htmlpage += '</' + tag + '>' htmlpage += '</' + tag + '>'
elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') : elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') :
@ -247,17 +336,17 @@ class DocParser(object):
if not anchorSet: if not anchorSet:
htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n' htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
anchorSet = True anchorSet = True
(pclass, first, last) = self.getParaDescription(start,end) (pclass, pdesc) = self.getParaDescription(start,end)
if ptype == 'full' : if ptype == 'full' :
tag = 'p' tag = 'p'
if pclass[3:6] == 'h1-' : tag = 'h4' if pclass[3:6] == 'h1-' : tag = 'h4'
if pclass[3:6] == 'h2-' : tag = 'h5' if pclass[3:6] == 'h2-' : tag = 'h5'
if pclass[3:6] == 'h3-' : tag = 'h6' if pclass[3:6] == 'h3-' : tag = 'h6'
htmlpage += '<' + tag + ' class="' + pclass + '">' htmlpage += '<' + tag + ' class="' + pclass + '">'
htmlpage += self.buildParagraph(pclass, first, last, 'middle', regtype) htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
htmlpage += '</' + tag + '>' htmlpage += '</' + tag + '>'
else : else :
htmlpage += self.buildParagraph(pclass, first, last, ptype, regtype) htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
elif (regtype == 'tocentry') : elif (regtype == 'tocentry') :
@ -271,12 +360,43 @@ class DocParser(object):
if not anchorSet: if not anchorSet:
htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n' htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
anchorSet = True anchorSet = True
(pclass, first, last) = self.getParaDescription(start,end) (pclass, pdesc) = self.getParaDescription(start,end)
htmlpage += self.buildParagraph(pclass, first, last, ptype, regtype) htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
elif regtype == 'synth_fcvr.center' :
if not anchorSet:
htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
anchorSet = True
(pos, simgsrc) = self.findinDoc('img.src',start,end)
if simgsrc:
htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
else : else :
print 'Unknown region type', regtype print 'Warning: Unknown region type', regtype
print 'Warning: skipping this region' print 'Treating this like a "fixed" region'
regtype = 'fixed'
ptype = 'full'
# check to see if this is a continution from the previous page
if (len(self.parastems_stemid) > 0):
ptype = 'end'
self.parastems_stemid=[]
else:
if not anchorSet:
htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
anchorSet = True
(pclass, desc) = self.getParaDescription(start,end)
if ptype == 'full' :
tag = 'p'
if pclass[3:6] == 'h1-' : tag = 'h4'
if pclass[3:6] == 'h2-' : tag = 'h5'
if pclass[3:6] == 'h3-' : tag = 'h6'
htmlpage += '<' + tag + ' class="' + pclass + '">'
htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
htmlpage += '</' + tag + '>'
else :
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
if len(self.paracont_stemid) > 0 : if len(self.paracont_stemid) > 0 :
if htmlpage[-4:] == '</p>': if htmlpage[-4:] == '</p>':
@ -289,10 +409,10 @@ class DocParser(object):
def convert2HTML(flatxml, fileid): def convert2HTML(flatxml, classlst, fileid):
# create a document parser # create a document parser
dp = DocParser(flatxml, fileid) dp = DocParser(flatxml, classlst, fileid)
htmlpage = dp.process() htmlpage = dp.process()

View File

@ -95,22 +95,27 @@ def main(argv):
htmlstr += '<meta name="Author" content="' + meta_array['Authors'] + '" />\n' htmlstr += '<meta name="Author" content="' + meta_array['Authors'] + '" />\n'
htmlstr += '<meta name="Title" content="' + meta_array['Title'] + '" />\n' htmlstr += '<meta name="Title" content="' + meta_array['Title'] + '" />\n'
# get some scaling info from metadata to use while processing styles
fontsize = '135'
if 'fontSize' in meta_array:
fontsize = meta_array['fontSize']
print ' ', 'other0000.dat' print ' ', 'other0000.dat'
fname = os.path.join(bookDir,'other0000.dat') fname = os.path.join(bookDir,'other0000.dat')
xname = os.path.join(bookDir, 'style.css') xname = os.path.join(bookDir, 'style.css')
xmlstr = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname) xmlstr = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
cssstr = '<style>\n' htmlstr += '<style>\n'
cssstr += stylexml2css.convert2CSS(xmlstr) cssstr , classlst = stylexml2css.convert2CSS(xmlstr, fontsize)
cssstr += '</style>\n'
file(xname, 'wb').write(cssstr) file(xname, 'wb').write(cssstr)
htmlstr += cssstr htmlstr += cssstr
htmlstr += '</style>\n'
htmlstr += '</head>\n<body>\n' htmlstr += '</head>\n<body>\n'
for filename in filenames: for filename in filenames:
print ' ', filename print ' ', filename
fname = os.path.join(pageDir,filename) fname = os.path.join(pageDir,filename)
flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname) flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
htmlstr += flatxml2html.convert2HTML(flat_xml, fname) htmlstr += flatxml2html.convert2HTML(flat_xml, classlst, fname)
htmlstr += '</body>\n</html>\n' htmlstr += '</body>\n</html>\n'

View File

@ -10,286 +10,301 @@ import decode_meta
class GParser(object): class GParser(object):
def __init__(self, flatxml): def __init__(self, flatxml):
self.flatdoc = flatxml.split('\n') self.flatdoc = flatxml.split('\n')
self.dpi = 1440 self.dpi = 1440
self.gh = self.getData('info.glyph.h') self.gh = self.getData('info.glyph.h')
self.gw = self.getData('info.glyph.w') self.gw = self.getData('info.glyph.w')
self.guse = self.getData('info.glyph.use') self.guse = self.getData('info.glyph.use')
self.count = len(self.guse) self.count = len(self.guse)
self.gvtx = self.getData('info.glyph.vtx') self.gvtx = self.getData('info.glyph.vtx')
self.glen = self.getData('info.glyph.len') self.glen = self.getData('info.glyph.len')
self.gdpi = self.getData('info.glyph.dpi') self.gdpi = self.getData('info.glyph.dpi')
self.vx = self.getData('info.vtx.x') self.vx = self.getData('info.vtx.x')
self.vy = self.getData('info.vtx.y') self.vy = self.getData('info.vtx.y')
self.vlen = self.getData('info.len.n') self.vlen = self.getData('info.len.n')
self.glen.append(len(self.vlen)) self.glen.append(len(self.vlen))
self.gvtx.append(len(self.vx)) self.gvtx.append(len(self.vx))
def getData(self, path): def getData(self, path):
result = None result = None
cnt = len(self.flatdoc) cnt = len(self.flatdoc)
for j in xrange(cnt): for j in xrange(cnt):
item = self.flatdoc[j] item = self.flatdoc[j]
if item.find('=') >= 0: if item.find('=') >= 0:
(name, argt) = item.split('=') (name, argt) = item.split('=')
argres = argt.split('|') argres = argt.split('|')
else: else:
name = item name = item
argres = [] argres = []
if (name == path): if (name == path):
result = argres result = argres
break break
if (len(argres) > 0) : if (len(argres) > 0) :
for j in xrange(0,len(argres)): for j in xrange(0,len(argres)):
argres[j] = int(argres[j]) argres[j] = int(argres[j])
return result return result
def getPath(self, gly): def getPath(self, gly):
path = '' path = ''
if (gly < 0) or (gly >= self.count): if (gly < 0) or (gly >= self.count):
return path return path
tx = self.vx[self.gvtx[gly]:self.gvtx[gly+1]-1] tx = self.vx[self.gvtx[gly]:self.gvtx[gly+1]-1]
ty = self.vy[self.gvtx[gly]:self.gvtx[gly+1]-1] ty = self.vy[self.gvtx[gly]:self.gvtx[gly+1]-1]
p = 0 p = 0
for k in xrange(self.glen[gly], self.glen[gly+1]): for k in xrange(self.glen[gly], self.glen[gly+1]):
if (p == 0): if (p == 0):
zx = tx[0:self.vlen[k]+1] zx = tx[0:self.vlen[k]+1]
zy = ty[0:self.vlen[k]+1] zy = ty[0:self.vlen[k]+1]
else: else:
zx = tx[self.vlen[k-1]+1:self.vlen[k]+1] zx = tx[self.vlen[k-1]+1:self.vlen[k]+1]
zy = ty[self.vlen[k-1]+1:self.vlen[k]+1] zy = ty[self.vlen[k-1]+1:self.vlen[k]+1]
p += 1 p += 1
for j in xrange(0, len(zx)): j = 0
if (j == 0): while ( j < len(zx) ):
path += 'M %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly]) if (j == 0):
else: # Start Position.
path += 'L %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly]) path += 'M %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly])
path += 'z' elif (j <= len(zx)-3):
return path # Cubic Bezier Curve
path += 'C %d %d %d %d %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly], zx[j+1] * self.dpi / self.gdpi[gly], zy[j+1] * self.dpi / self.gdpi[gly], zx[j+2] * self.dpi / self.gdpi[gly], zy[j+2] * self.dpi / self.gdpi[gly])
j += 2
elif (j == len(zx)-2):
# Cubic Bezier Curve to Start Position
path += 'C %d %d %d %d %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly], zx[j+1] * self.dpi / self.gdpi[gly], zy[j+1] * self.dpi / self.gdpi[gly], zx[0] * self.dpi / self.gdpi[gly], zy[0] * self.dpi / self.gdpi[gly])
j += 1
elif (j == len(zx)-1):
# Quadratic Bezier Curve to Start Position
path += 'Q %d %d %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly], zx[0] * self.dpi / self.gdpi[gly], zy[0] * self.dpi / self.gdpi[gly])
j += 1
path += 'z'
return path
class PParser(object):
    """Parse a flattened page-description document (one 'name=arg|arg|...'
    entry per line) into page metrics, glyph usage lists and positioned
    images, for SVG page generation.

    Attributes set by __init__:
      ph, pw  -- page height/width (ints, from 'page.*' or 'book.*' entries)
      gx, gy  -- glyph x/y position lists (or None if the page has no glyphs)
      gid     -- glyph id list (or None)
    """

    def __init__(self, flatxml):
        self.flatdoc = flatxml.split('\n')
        self.temp = []
        # Newer books store the page size under 'book.*' instead of
        # 'page.*', so fall back when the first lookup finds nothing.
        dim = self.getData('page.h') or self.getData('book.h')
        self.ph = dim[0]
        dim = self.getData('page.w') or self.getData('book.w')
        self.pw = dim[0]
        self.gx = self.getData('info.glyph.x')
        self.gy = self.getData('info.glyph.y')
        self.gid = self.getData('info.glyph.glyphID')

    def getData(self, path):
        """Return the integer argument list of the first document entry
        whose name ends with *path*, or None when no entry matches."""
        for item in self.flatdoc:
            if item.find('=') >= 0:
                # Split on the first '=' only: the argument text may
                # itself contain '=' characters.
                (name, argt) = item.split('=', 1)
                argres = argt.split('|')
            else:
                name = item
                argres = []
            if name.endswith(path):
                # Convert only the matched entry; the original code also
                # int()-converted the last scanned line on a miss.
                return [int(x) for x in argres]
        return None

    def getDataTemp(self, path):
        """Like getData but searches self.temp and removes the matched
        entry, so repeated calls walk through successive occurrences."""
        for j, item in enumerate(self.temp):
            if item.find('=') >= 0:
                (name, argt) = item.split('=', 1)
                argres = argt.split('|')
            else:
                name = item
                argres = []
            if name.endswith(path):
                self.temp.pop(j)
                return [int(x) for x in argres]
        return None

    def getImages(self):
        """Return SVG <image> element strings for every img record."""
        result = []
        # Work on a copy: getDataTemp() pops matched entries, and the
        # original alias (self.temp = self.flatdoc) silently destroyed
        # self.flatdoc while collecting the images.
        self.temp = self.flatdoc[:]
        while self.getDataTemp('img') != None:
            h = self.getDataTemp('img.h')[0]
            w = self.getDataTemp('img.w')[0]
            x = self.getDataTemp('img.x')[0]
            y = self.getDataTemp('img.y')[0]
            src = self.getDataTemp('img.src')[0]
            result.append('<image xlink:href="../img/img%04d.jpg" x="%d" y="%d" width="%d" height="%d" />\n' % (src, x, y, w, h))
        return result

    def getGlyphs(self, glyfname):
        """Return the lines of glyph file *glyfname* whose id="gl<N>"
        matches a glyph used on this page, in ascending id order.
        Assumes the glyph file also lists ids in ascending order."""
        result = []
        if (self.gid != None) and (len(self.gid) > 0):
            glyphs = sorted(set(self.gid))
            j = 0
            # 'with' guarantees the handle is closed even on an
            # exception (the original leaked it in that case).
            with open(glyfname, 'r') as gfile:
                for inp in gfile:
                    marker = 'id="gl%d"' % glyphs[j]
                    if inp.find(marker) > 0:
                        result.append(inp)
                        j += 1
                        if j == len(glyphs):
                            break
        return result
def usage():
    """Print command-line help for gensvg."""
    # Parenthesized single-argument print behaves identically under
    # Python 2 (statement) and Python 3 (function call).
    print('Usage: ')
    print(' ')
    print('  gensvg.py unencryptedBookDir')
    print(' ')
def main(argv): def main(argv):
bookDir = '' bookDir = ''
if len(argv) == 0: if len(argv) == 0:
argv = sys.argv argv = sys.argv
else : else :
argv = argv.split() argv = argv.split()
try: try:
opts, args = getopt.getopt(argv[1:], "h:") opts, args = getopt.getopt(argv[1:], "h:")
except getopt.GetoptError, err: except getopt.GetoptError, err:
print str(err) print str(err)
usage() usage()
sys.exit(2) sys.exit(2)
if len(opts) == 0 and len(args) == 0 : if len(opts) == 0 and len(args) == 0 :
usage() usage()
sys.exit(2) sys.exit(2)
for o, a in opts: for o, a in opts:
if o =="-h": if o =="-h":
usage() usage()
sys.exit(0) sys.exit(0)
bookDir = args[0] bookDir = args[0]
if not os.path.exists(bookDir) : if not os.path.exists(bookDir) :
print "Can not find directory with unencrypted book" print "Can not find directory with unencrypted book"
sys.exit(-1) sys.exit(-1)
dictFile = os.path.join(bookDir,'dict0000.dat') dictFile = os.path.join(bookDir,'dict0000.dat')
if not os.path.exists(dictFile) : if not os.path.exists(dictFile) :
print "Can not find dict0000.dat file" print "Can not find dict0000.dat file"
sys.exit(-1) sys.exit(-1)
pageDir = os.path.join(bookDir,'page') pageDir = os.path.join(bookDir,'page')
if not os.path.exists(pageDir) : if not os.path.exists(pageDir) :
print "Can not find page directory in unencrypted book" print "Can not find page directory in unencrypted book"
sys.exit(-1) sys.exit(-1)
imgDir = os.path.join(bookDir,'img') imgDir = os.path.join(bookDir,'img')
if not os.path.exists(imgDir) : if not os.path.exists(imgDir) :
print "Can not find image directory in unencrypted book" print "Can not find image directory in unencrypted book"
sys.exit(-1) sys.exit(-1)
glyphsDir = os.path.join(bookDir,'glyphs') glyphsDir = os.path.join(bookDir,'glyphs')
if not os.path.exists(glyphsDir) : if not os.path.exists(glyphsDir) :
print "Can not find glyphs directory in unencrypted book" print "Can not find glyphs directory in unencrypted book"
sys.exit(-1) sys.exit(-1)
metaFile = os.path.join(bookDir,'metadata0000.dat') metaFile = os.path.join(bookDir,'metadata0000.dat')
if not os.path.exists(metaFile) : if not os.path.exists(metaFile) :
print "Can not find metadata0000.dat in unencrypted book" print "Can not find metadata0000.dat in unencrypted book"
sys.exit(-1) sys.exit(-1)
svgDir = os.path.join(bookDir,'svg') svgDir = os.path.join(bookDir,'svg')
if not os.path.exists(svgDir) : if not os.path.exists(svgDir) :
os.makedirs(svgDir) os.makedirs(svgDir)
print 'Processing Meta Data ... ' print 'Processing Meta Data ... '
print ' ', 'metadata0000.dat' print ' ', 'metadata0000.dat'
fname = os.path.join(bookDir,'metadata0000.dat') fname = os.path.join(bookDir,'metadata0000.dat')
metadata = decode_meta.getMetaArray(fname) metadata = decode_meta.getMetaArray(fname)
print 'Processing Glyphs ... ' print 'Processing Glyphs ... '
filenames = os.listdir(glyphsDir) filenames = os.listdir(glyphsDir)
filenames = sorted(filenames) filenames = sorted(filenames)
glyfname = os.path.join(svgDir,'glyphs.svg') glyfname = os.path.join(svgDir,'glyphs.svg')
glyfile = open(glyfname, 'w') glyfile = open(glyfname, 'w')
glyfile.write('<?xml version="1.0" standalone="no"?>\n') glyfile.write('<?xml version="1.0" standalone="no"?>\n')
glyfile.write('<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n') glyfile.write('<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n')
glyfile.write('<svg width="512" height="512" viewBox="0 0 511 511" xmlns="http://www.w3.org/2000/svg" version="1.1">\n') glyfile.write('<svg width="512" height="512" viewBox="0 0 511 511" xmlns="http://www.w3.org/2000/svg" version="1.1">\n')
glyfile.write('<title>Glyphs for %s</title>\n' % metadata['Title']) glyfile.write('<title>Glyphs for %s</title>\n' % metadata['Title'])
glyfile.write('<defs>\n') glyfile.write('<defs>\n')
counter = 0 counter = 0
for filename in filenames: for filename in filenames:
print ' ', filename print ' ', filename
fname = os.path.join(glyphsDir,filename) fname = os.path.join(glyphsDir,filename)
flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname) flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
gp = GParser(flat_xml) gp = GParser(flat_xml)
for i in xrange(0, gp.count): for i in xrange(0, gp.count):
path = gp.getPath(i) path = gp.getPath(i)
glyfile.write('<path id="gl%d" d="%s" fill="black" />\n' % (counter * 256 + i, path)) glyfile.write('<path id="gl%d" d="%s" fill="black" />\n' % (counter * 256 + i, path))
counter += 1 counter += 1
glyfile.write('</defs>\n') glyfile.write('</defs>\n')
glyfile.write('</svg>\n') glyfile.write('</svg>\n')
glyfile.close() glyfile.close()
print 'Processing Pages ... ' print 'Processing Pages ... '
scaledpi = 720 scaledpi = 720
filenames = os.listdir(pageDir) filenames = os.listdir(pageDir)
filenames = sorted(filenames) filenames = sorted(filenames)
counter = 0 counter = 0
for filename in filenames: for filename in filenames:
print ' ', filename print ' ', filename
fname = os.path.join(pageDir,filename) fname = os.path.join(pageDir,filename)
flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname) flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
pp = PParser(flat_xml) pp = PParser(flat_xml)
pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w') pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w')
pfile.write('<?xml version="1.0" standalone="no"?>\n') pfile.write('<?xml version="1.0" standalone="no"?>\n')
pfile.write('<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n') pfile.write('<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n')
pfile.write('<svg width="%fin" height="%fin" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">\n' % (pp.pw / scaledpi, pp.ph / scaledpi, pp.pw -1, pp.ph -1)) pfile.write('<svg width="%fin" height="%fin" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">\n' % (pp.pw / scaledpi, pp.ph / scaledpi, pp.pw -1, pp.ph -1))
pfile.write('<title>Page %d - %s by %s</title>\n' % (counter, metadata['Title'],metadata['Authors'])) pfile.write('<title>Page %d - %s by %s</title>\n' % (counter, metadata['Title'],metadata['Authors']))
if (pp.gid != None): if (pp.gid != None):
pfile.write('<defs>\n') pfile.write('<defs>\n')
gdefs = pp.getGlyphs(glyfname) gdefs = pp.getGlyphs(glyfname)
for j in xrange(0,len(gdefs)): for j in xrange(0,len(gdefs)):
pfile.write(gdefs[j]) pfile.write(gdefs[j])
pfile.write('</defs>\n') pfile.write('</defs>\n')
for j in xrange(0,len(pp.gid)): for j in xrange(0,len(pp.gid)):
pfile.write('<use xlink:href="#gl%d" x="%d" y="%d" />\n' % (pp.gid[j], pp.gx[j], pp.gy[j])) pfile.write('<use xlink:href="#gl%d" x="%d" y="%d" />\n' % (pp.gid[j], pp.gx[j], pp.gy[j]))
img = pp.getImages() img = pp.getImages()
if (img != None): if (img != None):
for j in xrange(0,len(img)): for j in xrange(0,len(img)):
pfile.write(img[j]) pfile.write(img[j])
pfile.write('</svg>') pfile.write('</svg>')
pfile.close() pfile.close()
counter += 1 counter += 1
print 'Processing Complete' print 'Processing Complete'
return 0 return 0
if __name__ == '__main__': if __name__ == '__main__':
sys.exit(main('')) sys.exit(main(''))

View File

@ -1,3 +1,13 @@
Contributors:
cmbtc - removal of drm which made all of this possible
clarknova - for all of the svg and glyph generation and many other bug fixes and improvements
   skindle - for figuring out the general case for the mode loops
some updates - for conversion to xml, basic html
   DiapDealer - for extensive testing and feedback
and others for posting, feedback and testing
This is experimental and it will probably not work for you but... This is experimental and it will probably not work for you but...
ALSO: Please do not use any of this to steal. Theft is wrong. ALSO: Please do not use any of this to steal. Theft is wrong.

View File

@ -11,8 +11,9 @@ from struct import unpack
class DocParser(object): class DocParser(object):
def __init__(self, flatxml): def __init__(self, flatxml, fontsize):
self.flatdoc = flatxml.split('\n') self.flatdoc = flatxml.split('\n')
self.fontsize = int(fontsize)
stags = { stags = {
'paragraph' : 'p', 'paragraph' : 'p',
@ -20,14 +21,14 @@ class DocParser(object):
} }
attr_val_map = { attr_val_map = {
'hang' : ('text-indent: ', 135), 'hang' : 'text-indent: ',
'indent' : ('text-indent: ', 135), 'indent' : 'text-indent: ',
'line-space' : ('line-height: ', 190), 'line-space' : 'line-height: ',
'margin-bottom' : ('margin-bottom: ', 135), 'margin-bottom' : 'margin-bottom: ',
'margin-left' : ('margin-left: ', 135), 'margin-left' : 'margin-left: ',
'margin-right' : ('margin-right: ', 135), 'margin-right' : 'margin-right: ',
'margin-top' : ('margin-top: ', 135), 'margin-top' : 'margin-top: ',
'space-after' : ('padding-bottom: ', 135), 'space-after' : 'padding-bottom: ',
} }
attr_str_map = { attr_str_map = {
@ -55,7 +56,7 @@ class DocParser(object):
for j in xrange(pos, end): for j in xrange(pos, end):
item = docList[j] item = docList[j]
if item.find('=') >= 0: if item.find('=') >= 0:
(name, argres) = item.split('=') (name, argres) = item.split('=',1)
else : else :
name = item name = item
argres = '' argres = ''
@ -81,6 +82,7 @@ class DocParser(object):
def process(self): def process(self):
classlst = ''
csspage = '' csspage = ''
# generate a list of each <style> starting point in the stylesheet # generate a list of each <style> starting point in the stylesheet
@ -132,23 +134,19 @@ class DocParser(object):
else : else :
# handle value based attributes # handle value based attributes
if attr in self.attr_val_map : if attr in self.attr_val_map :
(name, scale) = self.attr_val_map[attr] name = self.attr_val_map[attr]
scale = self.fontsize
if attr == 'line-space': scale = scale * 1.41
if not ((attr == 'hang') and (int(val) == 0)) : if not ((attr == 'hang') and (int(val) == 0)) :
ems = int(val)/scale ems = int(val)/scale
cssargs[attr] = (self.attr_val_map[attr][0], ems) cssargs[attr] = (self.attr_val_map[attr], ems)
keep = True keep = True
start = pos + 1 start = pos + 1
# disable all of the after class tags until I figure out how to handle them # disable all of the after class tags until I figure out how to handle them
# remove all numerals after the "reclustered"
if aftclass != "" : keep = False if aftclass != "" : keep = False
p = sclass.find('reclustered')
if p >= 0:
sclass = sclass[0:p+11]
if keep : if keep :
# make sure line-space does not go below 1em # make sure line-space does not go below 1em
if 'line-space' in cssargs: if 'line-space' in cssargs:
@ -156,7 +154,7 @@ class DocParser(object):
val = cssargs['line-space'][1] val = cssargs['line-space'][1]
if val < 1.0: val = 1.0 if val < 1.0: val = 1.0
del cssargs['line-space'] del cssargs['line-space']
cssargs['line-space'] = (self.attr_val_map['line-space'][0], val) cssargs['line-space'] = (self.attr_val_map['line-space'], val)
@ -165,7 +163,7 @@ class DocParser(object):
hseg = cssargs['hang'][0] hseg = cssargs['hang'][0]
hval = cssargs['hang'][1] hval = cssargs['hang'][1]
del cssargs['hang'] del cssargs['hang']
cssargs['hang'] = (self.attr_val_map['hang'][0], -hval) cssargs['hang'] = (self.attr_val_map['hang'], -hval)
mval = 0 mval = 0
mseg = 'margin-left: ' mseg = 'margin-left: '
if 'margin-left' in cssargs: if 'margin-left' in cssargs:
@ -188,6 +186,9 @@ class DocParser(object):
cssline += '}' cssline += '}'
if sclass != '' :
classlst += sclass + '\n'
# handle special case of paragraph class used inside chapter heading # handle special case of paragraph class used inside chapter heading
# and non-chapter headings # and non-chapter headings
if sclass != '' : if sclass != '' :
@ -207,14 +208,15 @@ class DocParser(object):
csspage += self.stags[tag] + cssline + '\n' csspage += self.stags[tag] + cssline + '\n'
return csspage
return csspage, classlst
def convert2CSS(flatxml): def convert2CSS(flatxml, fontsize):
# create a document parser # create a document parser
dp = DocParser(flatxml) dp = DocParser(flatxml, fontsize)
csspage = dp.process() csspage = dp.process()