DeDRM_tools/eReader_Tools/lib/xpml2xhtml.py
Apprentice Alf 93c2ccd2c2 tools v1.0
(With some additions)
Lots of authors, brought together by Apprentice Alf.
2015-02-28 14:14:39 +00:00

857 lines
36 KiB
Python

#!/bin/env python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
#
# xPml2XHtml.py
#
# This is a python script. You need a Python interpreter to run it.
# For example, ActiveState Python, which exists for windows.
#
# Based on Code, Input and Ideas from:
# The Dark Reverser (original author)
# Kevin Hendricks
# Logan Kennelly
# John Schember (Calibre project)
# WayneD's (perl pml2html.pl)
# Changelog
# 0.02 - tried to greatly improve html conversion especially with \t tags
# 0.03 - more cleanup, a few fixes, and add in use of tidy to output xhtml
# 0.04 - incorporate more cleanups
# 0.05 - add check to fix block elements nested in inline elements which are not allowed
# 0.07 - handle clean up for remains left over from fixing nesting issues rampant in pml
# 0.08 - deal with inline style tags nesting issues in new way using a style tag list
# 0.09 - add in support for wrapping all text not in a block in <p></p> tags
# 0.10 - treat links effectively as block elements for style markup
# 0.11 - add in various paragraphs indentations to handle leading spaces that html would ignore or compress
# 0.12 - add in support for handling xml based pml footnotes and sidebars - using pseudo pml tags
# 0.14 - add in full header info parsing and remove need for bookinfo.txt
# 0.15 - cleanup high chars better handled, optional use of tidy with command line switch
# 0.16 - use proper and safe temporary file when passing things to tidy
# 0.17 - add support for tidy.exe under windows
# 0.18 - fix corner case of lines that start with \axxx or \Uxxxx tags
# 0.19 - change to use auto flushed stdout, and use proper return values
__version__='0.19'
class Unbuffered:
    """Stream proxy that flushes the wrapped stream after every write.

    Any attribute other than ``stream`` and ``write`` is looked up on the
    wrapped stream, so the proxy can stand in for the original object.
    """

    def __init__(self, stream):
        # The wrapped file-like object; every call is forwarded to it.
        self.stream = stream

    def write(self, data):
        """Write ``data`` to the wrapped stream and flush it straight away."""
        target = self.stream
        target.write(data)
        target.flush()

    def __getattr__(self, attr):
        # Only reached for names not found on the proxy itself.
        wrapped = self.stream
        return getattr(wrapped, attr)
import sys
# Route all stdout writes through the Unbuffered wrapper so that each write is
# flushed as soon as it is made.
sys.stdout=Unbuffered(sys.stdout)
import struct, binascii, zlib, os, getopt, os.path, urllib, re, tempfile
import logging
from subprocess import Popen, PIPE, STDOUT
# Set up the logging module with its default configuration; the commented-out
# line below is the debug-level alternative.
logging.basicConfig()
#logging.basicConfig(level=logging.DEBUG)
class PmlConverter(object):
def __init__(self, s):
def cleanupHighChars(src):
# special win1252 chars 0x80 - 0xa0 properly handled
src = re.sub('[\x80-\xff]', lambda x: '\\a%03d' % ord(x.group()), src)
src = re.sub('[^\x00-\xff]', lambda x: '\\U%04x' % ord(x.group()), src)
return src
def convertFootnoteXMLtoPseudoPML(src):
# creates pseudo tag \Ft="id"footnote text\Ft
p = re.compile(r'<footnote id="(?P<label>[^"]+)">\n')
m = p.search(src)
while m:
(b, e) = m.span()
fid = m.groups('label')[0]
src = src[0:b] + '\\p\\Ft="' + fid + '"' + src[e:]
src = re.sub('\n</footnote>\n','\\\\Ft\n\n',src,1)
m = p.search(src)
return src
def convertSidebarXMLtoPseudoPML(src):
# creates pseudo tag \St="id"sidebar text\St
p = re.compile(r'<sidebar id="(?P<label>[^"]+)">\n')
m = p.search(src)
while m:
(b, e) = m.span()
sid = m.groups('label')[0]
src = src[0:b] + '\\p\\St="' + sid + '"' + src[e:]
src = re.sub('\n</sidebar>\n','\\\\St\n\n',src,1)
m = p.search(src)
return src
def convert_x_to_pX0(src):
# converts all \x \x to \p\X0 \X0 make later code simpler
p = re.compile(r'\\x(.*?)\\x')
m = p.search(src)
while m:
(b, e) = m.span()
src = src[0:b] + '\\p\\X0' + src[b+2:e-2] + '\\X0' + src[e:]
m = p.search(src)
return src
def findPrevStartofLine(src,p,n):
# find last end of previous line in substring from p to n
b1 = src.rfind('\n',p,n)
b2 = src.rfind('\\c',p,n)
b3 = src.rfind('\\r',p,n)
b4 = src.rfind('\\x',p,n)
b5 = src.rfind('\\p',p,n)
b = max(b1, b2, b3, b4, b5)
if b == -1:
return n
if b == b1:
return b + 1
return b + 2
def markHangingIndents(src):
r = ''
p = 0
while True:
if p > len(src):
return r
n = src.find('\\t', p)
if n == -1:
r += src[p:]
return r
pc = findPrevStartofLine(src,p,n)
if pc == n :
# \t tag is at start of line so indent block will work
end = src.find('\\t',n+2)
if end == -1:
end = n
r += src[p:end+2]
p = end + 2
else :
# \t tag not at start of line so hanging indent case
# recode \t to pseudo \h tags and move it to start of this line
# and recode its close as well
r += src[p:pc] + '\\h' + src[pc:n]
end = src.find('\\t',n+2)
if end == -1:
end = n+2
r += src[n+2:end] + '\\h'
p = end + 2
return r
# recode double single slashes in pml to allow easier regular expression usage
s = s.replace('\\\\','_amp#92_')
s = cleanupHighChars(s)
s = markHangingIndents(s)
s = convertFootnoteXMLtoPseudoPML(s)
s = convertSidebarXMLtoPseudoPML(s)
s = convert_x_to_pX0(s)
# file('converted.pml','wb').write(s)
self.s = s
self.pos = 0
self.markChapters = (s.find('\\X') == -1)
def headerInfo(self):
title, author, copyright, publisher, eisbn = None, None, None, None, None
m = re.search(r'\\v.*TITLE="(?P<value>[^"]+)".*\\v', self.s, re.DOTALL)
if m:
title = m.groups('value')[0]
m = re.search(r'\\v.*AUTHOR="(?P<value>[^"]+)".*\\v', self.s, re.DOTALL)
if m:
author = m.groups('value')[0]
m = re.search(r'\\v.*PUBLISHER="(?P<value>[^"]+)".*\\v', self.s, re.DOTALL)
if m:
publisher = m.groups('value')[0]
m = re.search(r'\\v.*EISBN="(?P<value>[^"]+)".*\\v', self.s, re.DOTALL)
if m:
eisbn = m.groups('value')[0]
m = re.search(r'\\v.*COPYRIGHT="(?P<value>[^"]+)".*\\v', self.s, re.DOTALL)
if m:
copyright = m.groups('value')[0]
return title, author, copyright, publisher, eisbn
def nextOptAttr(self):
p = self.pos
if self.s[p:p+2] != '="':
return None
r = ''
p += 2
while self.s[p] != '"':
r += self.s[p]
p += 1
self.pos = p + 1
return r
def skipNewLine(self):
p = self.pos
if p >= len(self.s):
return
if self.s[p] == '\n':
self.pos = p + 1
return
def next(self):
p = self.pos
if p >= len(self.s):
return None
if self.s[p] != '\\':
res = self.s.find('\\', p)
if res == -1:
res = len(self.s)
self.pos = res
return self.s[p : res], None, None
c = self.s[p+1]
# add in support for new pseudo tag \\h
if c in 'pxcriuovthnsblBk-lI\\d':
self.pos = p + 2
return None, c, None
if c in 'TwmqQ':
self.pos = p + 2
return None, c, self.nextOptAttr()
if c == 'a':
self.pos = p + 5
return None, c, int(self.s[p+2:p+5])
if c == 'U':
self.pos = p + 6
return None, c, int(self.s[p+2:p+6], 16)
c = self.s[p+1:p+1+2]
if c in ('X0','X1','X2','X3','X4','Sp','Sb'):
self.pos = p + 3
return None, c, None
# add in support for new pseudo tags Ft and St
if c in ('C0','C1','C2','C3','C4','Fn','Sd', 'Ft', 'St'):
self.pos = p + 3
return None, c, self.nextOptAttr()
print "unknown escape code %s" % c
self.pos = p + 1
return None, None, None
def LinkPrinter(link):
return '<a href="%s">' % link
# for every footnote provide a unique id (built on footnote unique id)
# so that a hyperlink return is possibly
def FootnoteLinkPrinter(link):
rlink = 'return_' + link
footnote_ids[link] = rlink
return '<a id="%s" href="#%s">' % (rlink, link)
def Footnote(link):
return '<div title="footnote" id="%s">' % link
def EndFootnote(link):
return '&nbsp;&nbsp;<a href="#%s"><small>return</small></a></div>' % footnote_ids[link]
# for every sidebar provide a unique id (built from sidebar unique id)
# so that a hyperlink return is possibly
def SidebarLinkPrinter(link):
rlink = 'return_' + link
sidebar_ids[link] = rlink
return '<a id="%s" href="#%s">' % (rlink, link)
def Sidebar(link):
return '<div title="sidebar" id="%s">' % link
def EndSidebar(link):
return '&nbsp;&nbsp;<a href="#%s"><small>return</small></a></div>' % sidebar_ids[link]
# standard font switch is used mainly for special chars that may not be in user fonts
# but since these special chars are html encoded neither of these are needed
# def NormalFont(link):
# return '<!-- NormalFont -->%s' %link
# def EndNormalFont(link):
# return '<!-- EndNormalFont -->'
# def StdFont(link):
# return '<!-- StdFont -->%s' %link
# def EndStdFont(link):
# return '<!-- EndStdFont -->'
# See http://wiki.mobileread.com/wiki/PML#Palm_Markup_Language
# html non-style related beg and end tags
html_tags = {
'v' : ('<!-- ', ' -->'),
'c' : ('\n<div class="center">', '</div>\n'),
'r' : ('\n<div class="right">', '</div>\n'),
't' : ('<div class="indent">','</div>\n'),
'h' : ('<div class="hang">','</div>\n'), # pseudo-tag created to handle hanging indent cases
'X0' : ('<h1>', '</h1>\n'),
'X1' : ('<h2>', '</h2>\n'),
'X2' : ('<h3>', '</h3>\n'),
'X3' : ('<h4>', '</h4>\n'),
'X4' : ('<h5>', '</h5>\n'),
'q' : (LinkPrinter, '</a>'),
'Fn' : (FootnoteLinkPrinter, '</a>'),
'Sd' : (SidebarLinkPrinter, '</a>'),
'Ft' : (Footnote, EndFootnote),
'St' : (Sidebar, EndSidebar),
'I' : ('<i>', '</i>'),
'P' : ('<p>', '</p>\n'), # pseudo tag indicating a paragraph (imputed from pml file contents)
#'x' : ('<div class="breakafter"></div><h1>', '</h1>\n'), handled via recoding
}
# html style related beg and end tags
html_style_tags = {
'i' : ('<i>', '</i>'),
'u' : ('<span class="under">', '</span>'),
'b' : ('<b>', '</b>'),
'B' : ('<b>', '</b>'),
'o' : ('<del>', '</del>'),
'v' : ('<!-- ', ' -->'),
'Sb' : ('<sub>', '</sub>'),
'Sp' : ('<sup>', '</sup>'),
'l' : ('<span class="big">', '</span>'),
'k' : ('<span class="smallcaps">', '</span>'),
'I' : ('<i>', '</i>'), # according to calibre - all ereader does is italicize the index entries
'l' : ('<span class="big">', '</span>'),
'k' : ('<span class="smallcaps">', '</span>'),
#'n' : (NormalFont, EndNormalFont), handle as a single and strip out to prevent undesired mid word breaks
#'s' : (StdFont, EndStdFont), handle as a single and strip out to prevent undesired mid word breaks
}
# single tags (non-paired) that require no arguments
html_one_tags = {
#'p' : '<div class="breakafter"></div>\n', handle them in the if block to create at body level
#'\\': '\\', handled via recoding
'-' : '&shy;',
's' : '', # strip out see earlier note on standard and normla font use
'n' : '', # strip out see earlier note on standard and normla font use
}
# single tags that are not paired but that require attribute an argument
#'w' : handled in the if block,
#'m' : handled in if block,
#'Q' : handled in if block,
#'a' : handled in if block,
#'U' : handled in if block,
#'C0' : handled in if block,
#'C1' : handled in if block,
#'C2' : handled in if block,
#'C3' : handled in if block,
#'C4' : handled in if block,
#'T' : handled in if block,
html_block_tags = ('c','r','t','h','X0','X1','X2','X3','X4','x','P', 'Ft', 'St')
html_link_tags = ('q','Fn','Sd')
html_comment_tags = ('v')
pml_chars = {
128 : '&#8364;', 129 : '' , 130 : '&#8212;', 131 : '&#402;' , 132 : '&#8222;',
133 : '&#8230;', 134 : '&#8224;', 135 : '&#8225;', 136 : '&#710;' , 137 : '&#8240;',
138 : '&#352;' , 139 : '&#8249;', 140 : '&#338;' , 141 : '' , 142 : '&#381;' ,
143 : '' , 144 : '' , 145 : '&#8216;', 146 : '&#8217;', 147 : '&#8220;',
148 : '&#8221;', 149 : '&#8226;', 150 : '&#8211;', 151 : '&#8212;', 152 : '' ,
153 : '&#8482;', 154 : '&#353;' , 155 : '&#8250;', 156 : '&#339;' , 157 : '' ,
158 : '&#382;' , 159 : '&#376;' , 160 : '&nbsp;' ,
}
def process(self):
lastbreaksize = 0
final = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n'
final += '<html>\n<head>\n'
final += '<meta http-equiv="content-type" content="text/html; charset=windows-1252"/>\n'
title, author, copyright, publisher, eisbn = self.headerInfo()
if not title: title = bookname
if not author: author = 'Unknown'
final += '<title>%s by %s</title>\n' % (title, author)
final += '<meta name="Title" content="%s"/>\n' % title
final += '<meta name="Author" content="%s"/>\n' % author
if copyright: final += '<meta name="Copyright" content="%s"/>\n' % copyright
if publisher: final += '<meta name="Publisher" content="%s"/>\n' % publisher
if eisbn: final += '<meta name="EISBN" content="%s"/>\n' % eisbn
final += '<style type="text/css">\n'
final += 'div.center { text-align:center; }\n'
final += 'div.right { text-align:right; }\n'
final += 'div.indent { margin-left: 5%; margin-right: 5%; }\n'
final += 'div.hang { text-indent: -5%; margin-left: 5%; margin-right: 5%; }\n'
final += 'span.big { font-size: 175%; }\n'
final += 'span.smallcaps { font-size: 80%; font-variant: small-caps; }\n'
final += 'span.under { text-decoration: underline; }\n'
final += '.breakafter { page-break-after: always; }\n'
final += 'p { text-indent: 0; margin-top: 0; margin-bottom: 0; }\n'
final += 'p.i0 { text-indent: 0; margin-top: 0; margin-bottom: 0; }\n'
final += 'p.i1 { text-indent: 1%; margin-top: 0; margin-bottom: 0; }\n'
final += 'p.i2 { text-indent: 2%; margin-top: 0; margin-bottom: 0; }\n'
final += 'p.i3 { text-indent: 3%; margin-top: 0; margin-bottom: 0; }\n'
final += 'p.i4 { text-indent: 4%; margin-top: 0; margin-bottom: 0; }\n'
final += 'p.i5 { text-indent: 5%; margin-top: 0; margin-bottom: 0; }\n'
final += '</style>\n'
final += '</head>\n<body>\n'
in_tags = []
st_tags = []
def inSet(slist):
rval = False
j = len(in_tags)
if j == 0:
return False
while True:
j = j - 1
if in_tags[j][0] in slist:
rval = True
break
if j == 0:
break
return rval
def inBlock():
return inSet(self.html_block_tags)
def inLink():
return inSet(self.html_link_tags)
def inComment():
return inSet(self.html_comment_tags)
def inParaNow():
j = len(in_tags)
if j == 0:
return False
if in_tags[j-1][0] == 'P':
return True
return False
def getTag(ti, end):
cmd, attr = ti
r = self.html_tags[cmd][end]
if type(r) != str:
r = r(attr)
return r
def getSTag(ti, end):
cmd, attr = ti
r = self.html_style_tags[cmd][end]
if type(r) != str:
r = r(attr)
return r
def applyStyles(ending):
s = ''
j = len(st_tags)
if j > 0:
if ending:
while True:
j = j - 1
s += getSTag(st_tags[j], True)
if j == 0:
break
else:
k = 0
while True:
s += getSTag(st_tags[k], False)
k = k + 1
if k == j:
break
return s
def indentLevel(line_start):
nb = 0
while line_start[nb:nb+1] == ' ':
nb = nb + 1
line_start = line_start[nb:]
if nb > 5:
nb = 5
return nb, line_start
def makeText(s):
# handle replacements required for html
s = s.replace('&', '&amp;')
s = s.replace('<', '&lt;')
s = s.replace('>', '&gt;')
return_s =''
# parse the text line by line
lp = s.find('\n')
while lp != -1:
line = s[0:lp]
s = s[lp+1:]
if not inBlock() and not inLink() and not inComment():
if len(line) > 0:
# text should not exist in the <body> tag level unless it is in a comment
nb, line = indentLevel(line)
return_s += '<p class="i%d">' % nb
return_s += applyStyles(False)
return_s += line
return_s += applyStyles(True)
return_s += '</p>\n'
else:
return_s += '<p>&nbsp;</p>\n'
elif inParaNow():
# text is a continuation of a previously started paragraph
return_s += line
return_s += applyStyles(True)
return_s += '</p>\n'
j = len(in_tags)
del in_tags[j-1]
else:
if len(line) > 0:
return_s += line + '<br />\n'
else:
return_s += '<br />\n'
lp = s.find('\n')
linefrag = s
if len(linefrag) > 0:
if not inBlock() and not inLink() and not inComment():
nb, linefrag = indentLevel(linefrag)
return_s += '<p class="i%d">' % nb
return_s += applyStyles(False)
return_s += linefrag
ppair = ('P', None)
in_tags.append(ppair)
else:
return_s += linefrag
return return_s
while True:
r = self.next()
if not r:
break
text, cmd, attr = r
if text:
final += makeText(text)
if cmd:
# handle pseudo paragraph P tags
# close if starting a new block element
if cmd in self.html_block_tags or cmd == 'w':
j = len(in_tags)
if j > 0:
if in_tags[j-1][0] == 'P':
final += applyStyles(True)
final += getTag(in_tags[j-1],True)
del in_tags[j-1]
if cmd in self.html_block_tags:
pair = (cmd, attr)
if cmd not in [a for (a,b) in in_tags]:
# starting a new block tag
final += getTag(pair, False)
final += applyStyles(False)
in_tags.append(pair)
else:
# process ending tag for a tag pair
# ending tag should be for the most recently added start tag
j = len(in_tags)
if cmd == in_tags[j-1][0]:
final += applyStyles(True)
final += getTag(in_tags[j-1], True)
del in_tags[j-1]
else:
# ow: things are not properly nested
# process ending tag for block
# ending tag **should** be for the most recently added block tag
# but in too many cases it is not so we must fix this by
# closing all open tags up to the current one and then
# reopen all of the tags we had to close due to improper nesting of styles
print 'Warning: Improperly Nested Block Tags: expected %s found %s' % (cmd, in_tags[j-1][0])
print 'after processing %s' % final[-40:]
j = len(in_tags)
while True:
j = j - 1
final += applyStyles(True)
final += getTag(in_tags[j], True)
if in_tags[j][0] == cmd:
break
del in_tags[j]
# now create new block start tags if they were previously open
while j < len(st_tags):
final += getTag(in_tags[j], False)
final += applyStyles(False)
j = j + 1
self.skipNewLine()
elif cmd in self.html_link_tags:
pair = (cmd, attr)
if cmd not in [a for (a,b) in in_tags]:
# starting a new link tag
# first close out any still open styles
if inBlock():
final += applyStyles(True)
# output start tag and styles needed
final += getTag(pair, False)
final += applyStyles(False)
in_tags.append(pair)
else:
# process ending tag for a tag pair
# ending tag should be for the most recently added start tag
j = len(in_tags)
if cmd == in_tags[j-1][0]:
j = len(in_tags)
# apply closing styles and tag
final += applyStyles(True)
final += getTag(in_tags[j-1], True)
# if needed reopen any style tags
if inBlock():
final += applyStyles(False)
del in_tags[j-1]
else:
# ow: things are not properly nested
print 'Error: Improperly Nested Link Tags: expected %s found %s' % (cmd, in_tags[j-1][0])
print 'after processing %s' % final[-40:]
elif cmd in self.html_style_tags:
spair = (cmd, attr)
if cmd not in [a for (a,b) in st_tags]:
# starting a new style
if inBlock() or inLink():
final += getSTag(spair,False)
st_tags.append(spair)
else:
# process ending tag for style
# ending tag **should** be for the most recently added style tag
# but in too many cases it is not so we must fix this by
# closing all open tags up to the current one and then
# reopen all of the tags we had to close due to improper nesting of styles
j = len(st_tags)
while True:
j = j - 1
if inBlock() or inLink():
final += getSTag(st_tags[j], True)
if st_tags[j][0] == cmd:
break
del st_tags[j]
# now create new style start tags if they were previously open
while j < len(st_tags):
if inBlock() or inLink():
final += getSTag(st_tags[j], False)
j = j + 1
elif cmd in self.html_one_tags:
final += self.html_one_tags[cmd]
elif cmd == 'p':
# create page breaks at the <body> level so
# they can be easily used for safe html file segmentation breakpoints
# first close any open tags
j = len(in_tags)
if j > 0:
while True:
j = j - 1
if in_tags[j][0] in self.html_block_tags:
final += applyStyles(True)
final += getTag(in_tags[j], True)
if j == 0:
break
# insert the page break tag
final += '\n<div class="breakafter"></div>\n'
if sigil_breaks:
if (len(final) - lastbreaksize) > 3000:
final += '<div>\n <hr class="sigilChapterBreak" />\n</div>\n'
lastbreaksize = len(final)
# now create new start tags for all tags that
# were previously open
while j < len(in_tags):
final += getTag(in_tags[j], False)
if in_tags[j][0] in self.html_block_tags:
final += applyStyles(False)
j = j + 1
self.skipNewLine()
elif cmd[0:1] == 'C':
if self.markChapters:
# create toc entries at the <body> level
# since they will be in an invisible block
# first close any open tags
j = len(in_tags)
if j > 0:
while True:
j = j - 1
if in_tags[j][0] in self.html_block_tags:
final += applyStyles(True)
final += getTag(in_tags[j], True)
if j == 0:
break
level = int(cmd[1:2]) + 1
final += '<h%d title="%s"></h%d>' % (level, attr, level)
# now create new start tags for all tags that
# were previously open
while j < len(in_tags):
final += getTag(in_tags[j], False)
if in_tags[j][0] in self.html_block_tags:
final += applyStyles(False)
j = j + 1
else:
final += '<!-- ToC%s: %s -->' % (cmd[1:2], attr)
# now handle single tags (non-paired) that have attributes
elif cmd == 'm':
unquotedimagepath = bookname + '_img/' + attr
imagepath = urllib.quote( unquotedimagepath )
final += '<img src="%s" alt="" />' % imagepath
elif cmd == 'Q':
final += '<span id="%s"> </span>' % attr
elif cmd == 'a':
if not inBlock() and not inLink() and not inComment():
final += '<p class="i0">'
final += applyStyles(False)
final += self.pml_chars.get(attr, '&#%d;' % attr)
ppair = ('P', None)
in_tags.append(ppair)
else:
final += self.pml_chars.get(attr, '&#%d;' % attr)
elif cmd == 'U':
if not inBlock() and not inLink() and not inComment():
final += '<p class="i0">'
final += applyStyles(False)
final += '&#%d;' % attr
ppair = ('P', None)
in_tags.append(ppair)
else:
final += makeText('&#%d;' % attr)
elif cmd == 'w':
# hr width and align parameters are not allowed in strict xhtml but style widths are possible
final += '\n<hr style="width: %s;" />' % attr
# final += '<div style="width: %s; margin-left: auto; margin-right: auto; \
# border-top-style: solid; border-top-color: grey; border-top-width: thin;">&nbsp;</div>' % attr
self.skipNewLine()
elif cmd == 'T':
if inBlock():
final += '<span style="margin-left: %s;">&nbsp;</span>' % attr
else:
final += '<p style="text-indent: %s;">' % attr
final += applyStyles(False)
ppair = ('P', None)
in_tags.append(ppair)
else:
logging.warning("Unknown tag: %s-%s", cmd, attr)
# handle file ending condition for imputed P tags
j = len(in_tags)
if (j > 0):
if in_tags[j-1][0] == 'P':
final += '</p>'
final += '</body>\n</html>\n'
# recode html back to a single slash
final = final.replace('_amp#92_', '\\')
# cleanup the html code for issues specifically generated by this translation process
# ending divs already break the line at the end so we don't need the <br /> we added
final = final.replace('</div><br />\n','</div>\n')
# clean up empty elements that can be created when fixing improperly nested pml tags
# and by moving page break tags to the body level so that they can be used as html file split points
while True:
s = final
final = final.replace('<b></b>','')
final = final.replace('<i></i>','')
final = final.replace('<del></del>','')
final = final.replace('<sup></sup>','')
final = final.replace('<sub></sub>','')
final = final.replace('<span class="under"></span>','')
final = final.replace('<span class="big"></span>','')
final = final.replace('<span class="smallcaps"></span>','')
final = final.replace('<span class="under"> </span>','')
final = final.replace('<span class="big"> </span>','')
final = final.replace('<span class="smallcaps"> </span>','')
final = final.replace('<p></p>','')
final = final.replace('<p> </p>','')
final = final.replace('<p class="i0"></p>','')
final = final.replace('<p class="i1"></p>','')
final = final.replace('<p class="i2"></p>','')
final = final.replace('<p class="i3"></p>','')
final = final.replace('<p class="i4"></p>','')
final = final.replace('<p class="i5"></p>','')
final = final.replace('<h1></h1>\n','')
final = final.replace('<h2></h2>\n','')
final = final.replace('<h3></h3>\n','')
final = final.replace('<h4></h4>\n','')
final = final.replace('<h5></h5>\n','')
final = final.replace('<div class="center"></div>\n','')
final = final.replace('<div class="hang"></div>\n','')
final = final.replace('<div class="indent"></div>\n','')
final = final.replace('<div class="right"></div>\n','')
if s == final:
break
return final
def tidy(rawhtmlfile):
    """Run the command line ``tidy`` tool over a file and return its stdout.

    ``rawhtmlfile`` is the path of the html file that is fed to tidy on its
    standard input.  Tidy's stderr output is captured and discarded.
    """
    # --doctype strict forces strict dtd checking
    # --enclose-text yes - encloses non-block element text inside <body></body> into its own <p></p> block to meet xhtml spec
    # -w 120 -i will wrap text at column 120 and indent it to indicate level of nesting to make structure clearer
    # -win1252 sets the input encoding of pml files
    # -asxhtml convert to xhtml
    # -q (quiet)
    cmdline = 'tidy -w 120 -i -q -asxhtml -win1252 --enclose-text yes --doctype strict '
    if sys.platform[0:3] == 'win':
        cmdline = 'tidy.exe -w 120 -i -q -asxhtml -win1252 --enclose-text yes --doctype strict '
    # the with block closes the input file even if tidy cannot be started
    with open(rawhtmlfile, 'rb') as rawfobj:
        p2 = Popen(cmdline, shell=True, stdin=rawfobj, stdout=PIPE, stderr=PIPE, close_fds=False)
        stdout, stderr = p2.communicate()
    # print "Tidy Original Conversion Warnings and Errors"
    # print stderr
    return stdout
def usage():
    """Print the command line help text to stdout."""
    help_lines = (
        "Converts PML file to XHTML",
        "Usage:",
        " xpml2xhtml [options] infile.pml outfile.html ",
        " ",
        "Options: ",
        " -h prints this message",
        " --sigil-breaks insert Sigil Chapterbbreaks",
        " --use-tidy use tidy to further clean up the html ",
        " ",
    )
    # one line of output per entry, each terminated by a newline
    print("\n".join(help_lines))
def main(argv=None):
    """Command line entry point: convert a PML file to (X)HTML.

    ``argv`` is a full argument vector whose first element is the program
    name; when it is None, ``sys.argv`` is used.  Returns 0 on success (or
    when ``-h`` was given) and 1 on a usage error or a ValueError raised
    during conversion.

    Sets the module globals ``bookname``, ``footnote_ids``, ``sidebar_ids``
    and ``sigil_breaks`` that the converter reads.
    """
    global bookname
    global footnote_ids
    global sidebar_ids
    global sigil_breaks
    if argv is None:
        argv = sys.argv
    try:
        opts, args = getopt.getopt(argv[1:], "h", ["sigil-breaks", "use-tidy"])
    except getopt.GetoptError as err:
        print(str(err))
        usage()
        return 1
    sigil_breaks = False
    use_tidy = False
    # options are examined before the argument count so that a bare -h is
    # treated as a request for help rather than as a usage error
    for o, a in opts:
        if o == "-h":
            usage()
            return 0
        elif o == "--sigil-breaks":
            sigil_breaks = True
        elif o == "--use-tidy":
            use_tidy = True
    if len(args) != 2:
        usage()
        return 1
    infile, outfile = args[0], args[1]
    # the input file name without directory and extension; the converter uses
    # it as fallback title and as prefix of the image directory
    bookname = os.path.splitext(os.path.basename(infile))[0]
    footnote_ids = { }
    sidebar_ids = { }
    try:
        print("Processing...")
        import time
        start_time = time.time()
        print(" Converting pml to raw html")
        with open(infile, 'rb') as pmlfile:
            pml_string = pmlfile.read()
        pml = PmlConverter(pml_string)
        html_src = pml.process()
        if use_tidy:
            print(" Tidying html to xhtml")
            fobj = tempfile.NamedTemporaryFile(mode='w+b',suffix=".html",delete=False)
            tempname = fobj.name
            try:
                fobj.write(html_src)
                # closed before tidy reads the file
                fobj.close()
                html_src = tidy(tempname)
            finally:
                # the temporary file is removed even when tidy fails
                fobj.close()
                os.remove(tempname)
        with open(outfile, 'wb') as htmlfile:
            htmlfile.write(html_src)
        end_time = time.time()
        convert_time = end_time - start_time
        print('elapsed time: %.2f seconds' % (convert_time, ))
        print('output is in file %s' % outfile)
        print("Finished Processing")
    except ValueError as e:
        print("Error: %s" % e)
        return 1
    return 0
# Run the converter when the file is executed as a script; the value returned
# by main() becomes the process exit status.
if __name__ == "__main__":
    # Disabled profiling harness, kept for reference:
    #import cProfile
    #command = """sys.exit(main())"""
    #cProfile.runctx( command, globals(), locals(), filename="cprofile.profile" )
    sys.exit(main())