ereader2html 0.09 by brutusbum, also splitting out into erdr2pml and xpml2xhtml as well, so that we get some history
This commit is contained in:
committed by
Apprentice Alf
parent
5115a4aed6
commit
e4a0f92846
@@ -44,10 +44,10 @@
|
||||
# Using that with Calibre works a lot better than the HTML
|
||||
# conversion in this code.
|
||||
# 0.07 - Further Improved type 272 support for sidebars with all earlier fixes
|
||||
# 0.07a - Fixed some typos
|
||||
# 0.08 - fixed typos, removed extraneous things
|
||||
# 0.09 - tried to greatly improve html conversion especially with \t tags
|
||||
|
||||
|
||||
__version__='0.07'
|
||||
__version__='0.09'
|
||||
|
||||
# Import Psyco if available
|
||||
try:
|
||||
@@ -80,7 +80,6 @@ import logging
|
||||
logging.basicConfig()
|
||||
#logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
write_pml=False
|
||||
|
||||
ECB = 0
|
||||
CBC = 1
|
||||
@@ -503,17 +502,25 @@ class EreaderProcessor(object):
|
||||
# the remaining records of the footnote sections need to be decoded with the content_key and zlib inflated
|
||||
des = Des(fixKey(self.content_key))
|
||||
r += '\\w="100%"'
|
||||
r += 'Footnotes\p'
|
||||
r += '\\pFootnotes:\n\n'
|
||||
for i in xrange(1,self.num_footnote_pages):
|
||||
logging.debug('get footnotepage %d', i)
|
||||
id_len = ord(fnote_ids[2])
|
||||
id = fnote_ids[3:3+id_len]
|
||||
fmarker='\Q="%s"' % id
|
||||
fmarker='\\t\\Q="footnote-%s"' % id
|
||||
r+=fmarker
|
||||
r += zlib.decompress(des.decrypt(self.section_reader(self.first_footnote_page + i)))
|
||||
r += '\n'
|
||||
r += '\\t\n\n'
|
||||
fnote_ids = fnote_ids[id_len+4:]
|
||||
|
||||
# according to ereader pml spec we should be outputting the following xml for each footnote - but then we would have to handle
|
||||
# parsing it back in to convert it since that xml is not valid xhtml
|
||||
# fmarker = '<footnote id="footnote-%s">\n' % id
|
||||
# fmarker += zlib.decompress(des.decrypt(self.section_reader(self.first_footnote_page + i)))
|
||||
# fmarker += '\n</footnote>\n'
|
||||
# r += fmarker
|
||||
|
||||
|
||||
# now handle sidebar pages
|
||||
if self.num_sidebar_pages > 0:
|
||||
# the record 0 of the sidebar section must pass through the Xor Table to make it useful
|
||||
@@ -522,20 +529,70 @@ class EreaderProcessor(object):
|
||||
# the remaining records of the sidebar sections need to be decoded with the content_key and zlib inflated
|
||||
des = Des(fixKey(self.content_key))
|
||||
r += '\\w="100%"'
|
||||
r += 'Sidebars\p'
|
||||
r += '\\pSidebars:\n\n'
|
||||
for i in xrange(1,self.num_sidebar_pages):
|
||||
id_len = ord(sbar_ids[2])
|
||||
id = sbar_ids[3:3+id_len]
|
||||
fmarker='\Q="%s"' % id
|
||||
r+=fmarker
|
||||
smarker='\\t\\Q="sidebar-%s"' % id
|
||||
r+=smarker
|
||||
r += zlib.decompress(des.decrypt(self.section_reader(self.first_sidebar_page + i)))
|
||||
r += '\n'
|
||||
r += '\\t\n\n'
|
||||
sbar_ids = sbar_ids[id_len+4:]
|
||||
|
||||
# according to ereader pml spec we should be outputting the following xml for each sidebar - but then we would have to handle
|
||||
# parsing it back in to convert it since that xml is not valid xhtml
|
||||
# smarker = '<sidebar id="sidebar-%s">\n' % id
|
||||
# smarker += zlib.decompress(des.decrypt(self.section_reader(self.first_sidebar_page + i)))
|
||||
# smarker += '\n</sidebar>\n'
|
||||
# r += smarker
|
||||
|
||||
return r
|
||||
|
||||
class PmlConverter(object):
|
||||
def __init__(self, s, bkinfo):
|
||||
self.s = s
|
||||
def findPrevStartofLine(src,p,n):
|
||||
# find last end of previous line in substring from p to n
|
||||
b1 = src.rfind('\n',p,n)
|
||||
b2 = src.rfind('\\c',p,n)
|
||||
b3 = src.rfind('\\r',p,n)
|
||||
b4 = src.rfind('\\x',p,n)
|
||||
b5 = src.rfind('\\p',p,n)
|
||||
b = max(b1, b2, b3, b4, b5)
|
||||
if b == -1:
|
||||
return n
|
||||
if b == b1:
|
||||
return b + 1
|
||||
return b + 2
|
||||
def markHangingIndents(src):
|
||||
r = ''
|
||||
p = 0
|
||||
while True:
|
||||
if p > len(src):
|
||||
return r
|
||||
n = src.find('\\t', p)
|
||||
if n == -1:
|
||||
r += src[p:]
|
||||
return r
|
||||
pc = findPrevStartofLine(src,p,n)
|
||||
if pc == n :
|
||||
# \t tag is at start of line so indent block will work
|
||||
end = src.find('\\t',n+2)
|
||||
if end == -1:
|
||||
end = n
|
||||
r += src[p:end+2]
|
||||
p = end + 2
|
||||
else :
|
||||
# \t tag not at start of line so hanging indent case
|
||||
# recode \t to pseudo \h tags and move it to start of this line
|
||||
# and recode its close as well
|
||||
r += src[p:pc] + '\\h' + src[pc:n]
|
||||
end = src.find('\\t',n+2)
|
||||
if end == -1:
|
||||
end = n+2
|
||||
r += src[n+2:end] + '\\h'
|
||||
p = end + 2
|
||||
self.s = markHangingIndents(s)
|
||||
# file(os.path.join("./pseudo.pml"), 'wb').write(self.s)
|
||||
self.pos = 0
|
||||
self.bkinfo = bkinfo
|
||||
def nextOptAttr(self):
|
||||
@@ -560,7 +617,8 @@ class PmlConverter(object):
|
||||
self.pos = res
|
||||
return self.s[p : res], None, None
|
||||
c = self.s[p+1]
|
||||
if c in 'pxcriuovtnsblBk-lI\\d':
|
||||
# add in support for new pseudo tag \\h
|
||||
if c in 'pxcriuovthnsblBk-lI\\d':
|
||||
self.pos = p + 2
|
||||
return None, c, None
|
||||
if c in 'TwmqQ':
|
||||
@@ -583,48 +641,36 @@ class PmlConverter(object):
|
||||
self.pos = p + 1
|
||||
return None, None, None
|
||||
def LinePrinter(link):
    """Return a horizontal rule whose width attribute is `link`.

    The rule is followed by a newline (the 0.09 form).  The older return
    without the newline sat directly above it and made this one
    unreachable.
    """
    return '<hr width="%s" />\n' % link
|
||||
def LinkPrinter(link):
    """Return the opening anchor tag that points at `link`."""
    opening_tag = '<a href="%s">'
    return opening_tag % link
|
||||
def InternalLinkPrinter(link):
    """Return the opening anchor tag for the in-document fragment `link`."""
    opening_tag = '<a href="#%s">'
    return opening_tag % link
|
||||
def FootnoteLinkPrinter(link):
    """Return the opening anchor tag targeting the "footnote-<link>" fragment."""
    opening_tag = '<a href="#footnote-%s">'
    return opening_tag % link
|
||||
def SidebarLinkPrinter(link):
    """Return the opening anchor tag targeting the "sidebar-<link>" fragment."""
    opening_tag = '<a href="#sidebar-%s">'
    return opening_tag % link
|
||||
def NotSupported(link):
    """Reject a tag that has no HTML conversion.

    Always raises NotImplementedError.  NotImplemented is a constant, not
    an exception class, so the former `raise NotImplemented()` failed with
    a TypeError instead.
    """
    raise NotImplementedError()
|
||||
def NewChapterNewPage(link):
    """Placeholder for a tag handler that has not been written.

    Always raises NotImplementedError.  NotImplemented is a constant, not
    an exception class, so the former `raise NotImplemented()` failed with
    a TypeError instead.
    """
    raise NotImplementedError()
|
||||
def ChapterTitle(link):
    """Warn on stdout that chapter titles are unhandled; return an HTML comment holding `link`."""
    print("Nonfatal Error: ChapterTitle not implemented.")
    placeholder = '<!-- ChapterTitle %s -->'
    return placeholder % link
|
||||
def IndentPercent(link):
    """Return an empty span padded on the left by `link` percent.

    The earlier placeholder (a "not implemented" message plus an HTML
    comment) returned first and made this implementation unreachable, so
    it is removed.
    """
    return '<span style="padding-left: %s%%;"></span>' % link
|
||||
def NormalFont(link):
    """Warn on stdout that the normal-font tag is unhandled; return an HTML comment holding `link`."""
    print("Nonfatal Error: NormalFont not implemented.")
    placeholder = '<!-- NormalFont %s -->'
    return placeholder % link
|
||||
def StdFont(link):
    """Warn on stdout that the std-font tag is unhandled; return an HTML comment holding `link`."""
    print("Nonfatal Error: StdFont not implemented.")
    placeholder = '<!-- StdFont: %s -->'
    return placeholder % link
|
||||
def SingleBackslash(link):
    """Warn on stdout that the backslash escape is unhandled; return an HTML comment holding `link`."""
    print("Nonfatal Error: SingleBackslash not implemented.")
    placeholder = '<!-- SingleBackslash: %s -->'
    return placeholder % link
|
||||
def SoftHyphen(link):
    """Warn on stdout that the soft-hyphen tag is unhandled; return an HTML comment holding `link`."""
    print("Nonfatal Error: SoftHyphen not implemented.")
    placeholder = '<!-- SoftHyphen: %s -->'
    return placeholder % link
|
||||
def ReferenceIndexItem(link):
    """Warn on stdout that index items are unhandled; return an HTML comment holding `link`."""
    print("Nonfatal Error: ReferenceIndexItem not implemented.")
    placeholder = '<!-- CReferenceIndexItem: %s -->'
    return placeholder % link
|
||||
|
||||
# See http://wiki.mobileread.com/wiki/PML#Palm_Markup_Language
|
||||
html_tags = {
|
||||
'c' : ('<div class="center">', '</div>'),
|
||||
'r' : ('<div class="right">', '</div>'),
|
||||
'i' : ('<i>', '</i>'),
|
||||
'u' : ('<u>', '</u>'),
|
||||
'u' : ('<span class="under">', '</span>'),
|
||||
'b' : ('<strong>', '</strong>'),
|
||||
'B' : ('<strong>', '</strong>'),
|
||||
'o' : ('<strike>', '</strike>'),
|
||||
'o' : ('<del>', '</del>'),
|
||||
'v' : ('<!-- ', ' -->'),
|
||||
't' : ('', ''),
|
||||
't' : ('<div class="indent">','</div>'),
|
||||
'h' : ('<div class="hang">','</div>'), # pseudo-tag created to handle hanging indent cases
|
||||
'Sb' : ('<sub>', '</sub>'),
|
||||
'Sp' : ('<sup>', '</sup>'),
|
||||
'X0' : ('<h1>', '</h1>'),
|
||||
@@ -632,28 +678,31 @@ class PmlConverter(object):
|
||||
'X2' : ('<h3>', '</h3>'),
|
||||
'X3' : ('<h4>', '</h4>'),
|
||||
'X4' : ('<h5>', '</h5>'),
|
||||
'l' : ('<font size="+2">', '</font>'),
|
||||
'l' : ('<span class="big">', '</span>'),
|
||||
'q' : (LinkPrinter, '</a>'),
|
||||
'Fn' : (InternalLinkPrinter, '</a>'),
|
||||
'Sd' : (InternalLinkPrinter, '</a>'),
|
||||
'Fn' : (FootnoteLinkPrinter, '</a>'),
|
||||
'Sd' : (SidebarLinkPrinter, '</a>'),
|
||||
'w' : (LinePrinter, ''),
|
||||
#'m' : handled in if block,
|
||||
#'Q' : handled in if block,
|
||||
#'a' : handled in if block,
|
||||
#'U' : handled in if block,
|
||||
'x' : ('<h1> class="breakbefore">', '</h1>'),
|
||||
'Cn' : (ChapterTitle, ''),
|
||||
'x' : ('<h1 class="breakbefore">', '</h1>'),
|
||||
#'C0' : handled in if block,
|
||||
#'C1' : handled in if block,
|
||||
#'C2' : handled in if block,
|
||||
#'C3' : handled in if block,
|
||||
#'C4' : handled in if block,
|
||||
'T' : (IndentPercent, ''),
|
||||
'n' : (NormalFont, ''),
|
||||
#'s' : (StdFont, ''),
|
||||
's' : ('', ''),
|
||||
'k' : ('<span style="font-variant: small-caps;">', '</span>'), # NOTE some pdb's then go ahead and use uppercase letters - which doesn't format the way one would expect (perhaps post process the output with html dom and lower ase is only upper case letters are found?)
|
||||
'\\' : (SingleBackslash, ''),
|
||||
'-' : (SoftHyphen, ''),
|
||||
'I' : (ReferenceIndexItem, ''),
|
||||
'k' : ('<span class="small">', '</span>'),
|
||||
'I' : ('<i>', '</i>'), # according to calibre - all ereader does is italicize the index entries
|
||||
}
|
||||
html_one_tags = {
|
||||
'p' : '<p class="breakafter"> </p>\n'
|
||||
'p' : '<p class="breakafter"> </p>\n',
|
||||
'\\': '\\',
|
||||
'-' : '­',
|
||||
}
|
||||
pml_chars = {
|
||||
160 : ' ',130 : '—',131: 'ƒ',132: '„',
|
||||
@@ -674,7 +723,17 @@ class PmlConverter(object):
|
||||
final += '<meta name="Copyright" content="%s"/>\n' % copyright
|
||||
final += '<meta name="Publisher" content="%s"/>\n' % publisher
|
||||
final += '<meta name="ISBN" content="%s"/>\n' % isbn
|
||||
final += '<style type="text/css">\ndiv.center { text-align:center; }\ndiv.right { text-align:right; }\n.breakbefore { page-break-before: always; }\n.breakafter { page-break-after: always; }\n</style>\n'
|
||||
final += '<style type="text/css">\n'
|
||||
final += 'div.center { text-align:center; }\n'
|
||||
final += 'div.right { text-align:right; }\n'
|
||||
final += 'div.indent { margin-left: 5%; }\n'
|
||||
final += 'div.hang { text-indent: -5%; margin-left: 5%; }\n'
|
||||
final += 'span.big { font-size: 175%; }\n'
|
||||
final += 'span.small { font-size: 50%; }\n'
|
||||
final += 'span.under { text-decoration: underline; }\n'
|
||||
final += '.breakbefore { page-break-before: always; }\n'
|
||||
final += '.breakafter { page-break-after: always; }\n'
|
||||
final += '</style>\n'
|
||||
final += '</head><body>\n'
|
||||
in_tags = []
|
||||
def makeText(s):
|
||||
@@ -682,7 +741,7 @@ class PmlConverter(object):
|
||||
#s = s.replace('"', '"')
|
||||
s = s.replace('<', '<')
|
||||
s = s.replace('>', '>')
|
||||
s = s.replace('\n', '<br>\n')
|
||||
s = s.replace('\n', '<br />\n')
|
||||
return s
|
||||
while True:
|
||||
r = self.next()
|
||||
@@ -698,7 +757,6 @@ class PmlConverter(object):
|
||||
if type(r) != str:
|
||||
r = r(attr)
|
||||
return r
|
||||
|
||||
if cmd in self.html_tags:
|
||||
pair = (cmd, attr)
|
||||
if cmd not in [a for (a,b) in in_tags]:
|
||||
@@ -719,21 +777,33 @@ class PmlConverter(object):
|
||||
if cmd in self.html_one_tags:
|
||||
final += self.html_one_tags[cmd]
|
||||
if cmd == 'm':
|
||||
unquotedimagepath = bookname + "_img/" + attr
|
||||
unquotedimagepath = "images/" + attr
|
||||
imagepath = urllib.quote( unquotedimagepath )
|
||||
final += '<img src="%s" alt="">' % imagepath
|
||||
if cmd == 'Q':
|
||||
final += '<a name="%s" id="%s"> </a>' % (attr, attr)
|
||||
final += '<span id="%s"> </span>' % attr
|
||||
if cmd == 'C0':
|
||||
final += '<!-- ContentsList "%s" -->' % attr
|
||||
if cmd == 'C1':
|
||||
final += '<!-- ContentsList " %s" -->' % attr
|
||||
if cmd == 'C2':
|
||||
final += '<!-- ContentsList " %s" -->' % attr
|
||||
if cmd == 'C3':
|
||||
final += '<!-- ContentsList " %s" -->' % attr
|
||||
if cmd == 'C4':
|
||||
final += '<!-- ContentsList " %s" -->' % attr
|
||||
if cmd == 'a':
|
||||
final += self.pml_chars.get(attr, '&#%d;' % attr)
|
||||
if cmd == 'U':
|
||||
final += '&#%d;' % attr
|
||||
final += '</body></html>\n'
|
||||
while True:
|
||||
s = final.replace('<br>\n<br>\n<br>\n', '<br>\n<br>\n')
|
||||
if s == final:
|
||||
break
|
||||
final = s
|
||||
# while True:
|
||||
# s = final.replace('<br />\n<br />\n<br />\n', '<br />\n<br />\n')
|
||||
# if s == final:
|
||||
# break
|
||||
# final = s
|
||||
s = final.replace('</div><br />','</div>\n')
|
||||
final = s
|
||||
return final
|
||||
|
||||
def convertEreaderToHtml(infile, name, cc, outdir):
|
||||
@@ -743,7 +813,7 @@ def convertEreaderToHtml(infile, name, cc, outdir):
|
||||
er = EreaderProcessor(sect.loadSection, name, cc)
|
||||
|
||||
if er.getNumImages() > 0:
|
||||
imagedir = bookname + "_img"
|
||||
imagedir = "images/"
|
||||
imagedirpath = os.path.join(outdir,imagedir)
|
||||
if not os.path.exists(imagedirpath):
|
||||
os.makedirs(imagedirpath)
|
||||
@@ -761,14 +831,14 @@ def convertEreaderToHtml(infile, name, cc, outdir):
|
||||
htmlfilename = bookname + ".html"
|
||||
file(os.path.join(outdir, htmlfilename),'wb').write(pml.process())
|
||||
|
||||
ts = er.getExpandedTextSizesData()
|
||||
file(os.path.join(outdir, 'xtextsizes.dat'), 'wb').write(ts)
|
||||
# ts = er.getExpandedTextSizesData()
|
||||
# file(os.path.join(outdir, 'xtextsizes.dat'), 'wb').write(ts)
|
||||
|
||||
cv = er.getChapterNamePMLOffsetData()
|
||||
file(os.path.join(outdir, 'chapters.dat'), 'wb').write(cv)
|
||||
|
||||
lv = er.getLinkNamePMLOffsetData()
|
||||
file(os.path.join(outdir, 'links.dat'), 'wb').write(lv)
|
||||
# lv = er.getLinkNamePMLOffsetData()
|
||||
# file(os.path.join(outdir, 'links.dat'), 'wb').write(lv)
|
||||
|
||||
|
||||
def main(argv=None):
|
||||
@@ -811,4 +881,4 @@ if __name__ == "__main__":
|
||||
#command = """sys.exit(main())"""
|
||||
#cProfile.runctx( command, globals(), locals(), filename="cprofile.profile" )
|
||||
|
||||
sys.exit(main())
|
||||
sys.exit(main())
|
||||
Reference in New Issue
Block a user