tools v4.8
@@ -20,6 +20,8 @@ import getopt
from struct import pack
from struct import unpack

class TpzDRMError(Exception):
    pass

# Get a 7 bit encoded number from string. The most
# significant byte comes first and has the high bit (8th) set
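For reference, a minimal sketch of the decoder this comment describes, assuming the usual continuation scheme (each byte carries 7 payload bits; every byte except the last has the high bit set):

    def read_encoded_number(data, pos=0):
        # accumulate 7 bits per byte, most significant byte first;
        # a set high bit (0x80) means another byte follows
        val = 0
        while True:
            b = ord(data[pos])
            pos += 1
            val = (val << 7) | (b & 0x7F)
            if not (b & 0x80):      # high bit clear: final byte
                return val, pos

    # '\x81\x00' decodes to (128, 2); '\x05' decodes to (5, 1)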
@@ -138,7 +140,8 @@ class Dictionary(object):
            return self.stable[self.pos]
        else:
            print "Error - %d outside of string table limits" % val
            sys.exit(-1)
            raise TpzDRMError('outside of string table limits')
            # sys.exit(-1)

    def getSize(self):
        return self.size
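The point of swapping sys.exit(-1) for a raise: an exception can be caught by the embedding application (the version history below notes the move to calibre's plugin interface), while sys.exit() tears down the host process. A hedged sketch of the calling pattern this enables; lookup() is a hypothetical call site:

    try:
        word = dictionary.lookup(val)   # hypothetical caller
    except TpzDRMError, e:
        print str(e)
        # clean up and return an error code instead of exiting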
@@ -258,6 +261,11 @@ class PageParser(object):
        'paragraph.class' : (1, 'scalar_text', 0, 0),
        'paragraph.firstWord' : (1, 'scalar_number', 0, 0),
        'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
        'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
        'paragraph.gridSize' : (1, 'scalar_number', 0, 0),
        'paragraph.gridBottomCenter' : (1, 'scalar_number', 0, 0),
        'paragraph.gridTopCenter' : (1, 'scalar_number', 0, 0),


        'word_semantic' : (1, 'snippets', 1, 1),
        'word_semantic.type' : (1, 'scalar_text', 0, 0),

@@ -272,11 +280,17 @@ class PageParser(object):

        '_span' : (1, 'snippets', 1, 0),
        '_span.firstWord' : (1, 'scalar_number', 0, 0),
        '-span.lastWord' : (1, 'scalar_number', 0, 0),
        '_span.lastWord' : (1, 'scalar_number', 0, 0),
        '_span.gridSize' : (1, 'scalar_number', 0, 0),
        '_span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
        '_span.gridTopCenter' : (1, 'scalar_number', 0, 0),

        'span' : (1, 'snippets', 1, 0),
        'span.firstWord' : (1, 'scalar_number', 0, 0),
        'span.lastWord' : (1, 'scalar_number', 0, 0),
        'span.gridSize' : (1, 'scalar_number', 0, 0),
        'span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
        'span.gridTopCenter' : (1, 'scalar_number', 0, 0),

        'extratokens' : (1, 'snippets', 1, 0),
        'extratokens.type' : (1, 'scalar_text', 0, 0),

@@ -59,8 +59,11 @@
# 0.18 - on Windows try PyCrypto first and OpenSSL next
# 0.19 - Modify the interface to allow use of import
# 0.20 - modify to allow use inside new interface for calibre plugins
# 0.21 - Support eReader (drm) version 11.
#      - Don't reject dictionary format.
#      - Ignore sidebars for dictionaries (different format?)

__version__='0.20'
__version__='0.21'

class Unbuffered:
    def __init__(self, stream):
@@ -140,12 +143,18 @@ logging.basicConfig()


class Sectionizer(object):
    bkType = "Book"

    def __init__(self, filename, ident):
        self.contents = file(filename, 'rb').read()
        self.header = self.contents[0:72]
        self.num_sections, = struct.unpack('>H', self.contents[76:78])
        # Dictionary or normal content (TODO: Not hard-coded)
        if self.header[0x3C:0x3C+8] != ident:
            raise ValueError('Invalid file format')
            if self.header[0x3C:0x3C+8] == "PDctPPrs":
                self.bkType = "Dict"
            else:
                raise ValueError('Invalid file format')
        self.sections = []
        for i in xrange(self.num_sections):
            offset, a1,a2,a3,a4 = struct.unpack('>LBBBB', self.contents[78+i*8:78+i*8+8])
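Pulling the header logic above together, a minimal standalone sketch of the Palm-database layout the Sectionizer relies on, restating only what the diff itself shows (type/creator ident at offset 0x3C, big-endian section count at offset 76, 8-byte section entries from offset 78):

    import struct

    def read_sections(contents):
        ident = contents[0x3C:0x3C+8]     # e.g. 'PNRdPPrs' or 'PDctPPrs'
        num_sections, = struct.unpack('>H', contents[76:78])
        offsets = []
        for i in xrange(num_sections):
            offset, a1, a2, a3, a4 = struct.unpack('>LBBBB',
                    contents[78+i*8:78+i*8+8])
            offsets.append(offset)
        return ident, offsets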
@@ -182,15 +191,15 @@ def deXOR(text, sp, table):
    return r

class EreaderProcessor(object):
    def __init__(self, section_reader, username, creditcard):
        self.section_reader = section_reader
        data = section_reader(0)
    def __init__(self, sect, username, creditcard):
        self.section_reader = sect.loadSection
        data = self.section_reader(0)
        version, = struct.unpack('>H', data[0:2])
        self.version = version
        logging.info('eReader file format version %s', version)
        if version != 272 and version != 260 and version != 259:
            raise ValueError('incorrect eReader version %d (error 1)' % version)
        data = section_reader(1)
        data = self.section_reader(1)
        self.data = data
        des = Des(fixKey(data[0:8]))
        cookie_shuf, cookie_size = struct.unpack('>LL', des.decrypt(data[-8:]))
@@ -219,11 +228,17 @@ class EreaderProcessor(object):
        self.num_text_pages = struct.unpack('>H', r[2:4])[0] - 1
        self.num_image_pages = struct.unpack('>H', r[26:26+2])[0]
        self.first_image_page = struct.unpack('>H', r[24:24+2])[0]
        # Default values
        self.num_footnote_pages = 0
        self.num_sidebar_pages = 0
        self.first_footnote_page = -1
        self.first_sidebar_page = -1
        if self.version == 272:
            self.num_footnote_pages = struct.unpack('>H', r[46:46+2])[0]
            self.first_footnote_page = struct.unpack('>H', r[44:44+2])[0]
            self.num_sidebar_pages = struct.unpack('>H', r[38:38+2])[0]
            self.first_sidebar_page = struct.unpack('>H', r[36:36+2])[0]
            if (sect.bkType == "Book"):
                self.num_sidebar_pages = struct.unpack('>H', r[38:38+2])[0]
                self.first_sidebar_page = struct.unpack('>H', r[36:36+2])[0]
            # self.num_bookinfo_pages = struct.unpack('>H', r[34:34+2])[0]
            # self.first_bookinfo_page = struct.unpack('>H', r[32:32+2])[0]
            # self.num_chapter_pages = struct.unpack('>H', r[22:22+2])[0]
@@ -239,10 +254,8 @@ class EreaderProcessor(object):
            self.xortable_size = struct.unpack('>H', r[42:42+2])[0]
            self.xortable = self.data[self.xortable_offset:self.xortable_offset + self.xortable_size]
        else:
            self.num_footnote_pages = 0
            self.num_sidebar_pages = 0
            self.first_footnote_page = -1
            self.first_sidebar_page = -1
            # Nothing needs to be done
            pass
            # self.num_bookinfo_pages = 0
            # self.num_chapter_pages = 0
            # self.num_link_pages = 0
@@ -267,10 +280,14 @@ class EreaderProcessor(object):
            encrypted_key_sha = r[44:44+20]
            encrypted_key = r[64:64+8]
        elif version == 260:
            if drm_sub_version != 13:
            if drm_sub_version != 13 and drm_sub_version != 11:
                raise ValueError('incorrect eReader version %d (error 3)' % drm_sub_version)
            encrypted_key = r[44:44+8]
            encrypted_key_sha = r[52:52+20]
            if drm_sub_version == 13:
                encrypted_key = r[44:44+8]
                encrypted_key_sha = r[52:52+20]
            else:
                encrypted_key = r[64:64+8]
                encrypted_key_sha = r[44:44+20]
        elif version == 272:
            encrypted_key = r[172:172+8]
            encrypted_key_sha = r[56:56+20]
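For orientation, the key-material locations implied by the branches above, expressed as offsets into record r (derived from this diff alone; the first pair presumably belongs to the version 259 branch):

    # version 259 (presumably):     key = r[64:64+8],   sha = r[44:44+20]
    # version 260, sub-version 13:  key = r[44:44+8],   sha = r[52:52+20]
    # version 260, sub-version 11:  key = r[64:64+8],   sha = r[44:44+20]
    # version 272:                  key = r[172:172+8], sha = r[56:56+20]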
@@ -356,6 +373,12 @@ class EreaderProcessor(object):
                r += fmarker
                fnote_ids = fnote_ids[id_len+4:]

        # TODO: Handle dictionary index (?) pages - which are also marked as
        # sidebar_pages (?). For now dictionary sidebars are ignored
        # For dictionaries - record 0 is null terminated strings, followed by
        # blocks of around 62000 bytes and a final block. Not sure of the
        # encoding

        # now handle sidebar pages
        if self.num_sidebar_pages > 0:
            r += '\n'

@@ -368,7 +391,7 @@ class EreaderProcessor(object):
                id_len = ord(sbar_ids[2])
                id = sbar_ids[3:3+id_len]
                smarker = '<sidebar id="%s">\n' % id
                smarker += zlib.decompress(des.decrypt(self.section_reader(self.first_footnote_page + i)))
                smarker += zlib.decompress(des.decrypt(self.section_reader(self.first_sidebar_page + i)))
                smarker += '\n</sidebar>\n'
                r += smarker
                sbar_ids = sbar_ids[id_len+4:]
@@ -389,7 +412,7 @@ def convertEreaderToPml(infile, name, cc, outdir):
    bookname = os.path.splitext(os.path.basename(infile))[0]
    print " Decoding File"
    sect = Sectionizer(infile, 'PNRdPPrs')
    er = EreaderProcessor(sect.loadSection, name, cc)
    er = EreaderProcessor(sect, name, cc)

    if er.getNumImages() > 0:
        print " Extracting images"

@@ -271,6 +271,9 @@ class DocParser(object):

        pclass = self.getClass(pclass)

        # if paragraph uses extratokens (extra glyphs) then make it fixed
        (pos, extraglyphs) = self.findinDoc('paragraph.extratokens',start,end)

        # build up a description of the paragraph in result and return it
        # first check for the basic - all words paragraph
        (pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)

@@ -280,6 +283,7 @@ class DocParser(object):
            last = int(slast)

            makeImage = (regtype == 'vertical') or (regtype == 'table')
            makeImage = makeImage or (extraglyphs != None)
            if self.fixedimage:
                makeImage = makeImage or (regtype == 'fixed')

@@ -353,6 +357,8 @@ class DocParser(object):

        word_class = ''

        word_semantic_type = ''

        while (line < end) :

            (name, argres) = self.lineinDoc(line)
@@ -512,6 +518,72 @@ class DocParser(object):
        return parares


    def buildTOCEntry(self, pdesc) :
        parares = ''
        sep =''
        tocentry = ''
        handle_links = len(self.link_id) > 0

        lstart = 0

        cnt = len(pdesc)
        for j in xrange( 0, cnt) :

            (wtype, num) = pdesc[j]

            if wtype == 'ocr' :
                word = self.ocrtext[num]
                sep = ' '

                if handle_links:
                    link = self.link_id[num]
                    if (link > 0):
                        linktype = self.link_type[link-1]
                        title = self.link_title[link-1]
                        title = title.rstrip('. ')
                        alt_title = parares[lstart:]
                        alt_title = alt_title.strip()
                        # now strip off the actual printed page number
                        alt_title = alt_title.rstrip('01234567890ivxldIVXLD-.')
                        alt_title = alt_title.rstrip('. ')
                        # skip over any external links - can't have them in a books toc
                        if linktype == 'external' :
                            title = ''
                            alt_title = ''
                            linkpage = ''
                        else :
                            if len(self.link_page) >= link :
                                ptarget = self.link_page[link-1] - 1
                                linkpage = '%04d' % ptarget
                            else :
                                # just link to the current page
                                linkpage = self.id[4:]
                        if len(alt_title) >= len(title):
                            title = alt_title
                        if title != '' and linkpage != '':
                            tocentry += title + '|' + linkpage + '\n'
                        lstart = len(parares)
                        if word == '_link_' : word = ''
                    elif (link < 0) :
                        if word == '_link_' : word = ''

                if word == '_lb_':
                    word = ''
                    sep = ''

                if num in self.dehyphen_rootid :
                    word = word[0:-1]
                    sep = ''

                parares += word + sep

            else :
                continue

        return tocentry

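The tocentry string built here uses a minimal 'title|page' line format that generateBook later splits first on newlines and then on '|'. A small hedged illustration (titles invented):

    # each entry:  <title>|<zero-padded page file number>
    tocentry = 'Chapter One|0012\n'
    tocentry += 'Chapter Two|0031\n'

    toclst = tocentry.split('\n')
    toclst.pop()                      # drop the trailing empty string
    for entry in toclst:
        title, pagenum = entry.split('|')
        # pagenum later indexes pageidnums to recover the original page id

Note also the two-step rstrip above: the first call trims a trailing printed page number (arabic digits or roman-numeral letters), the second removes the leftover dot leaders and spaces.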
    # walk the document tree collecting the information needed
    # to build an html page using the ocrText

@@ -519,6 +591,7 @@ class DocParser(object):
    def process(self):

        htmlpage = ''
        tocinfo = ''

        # get the ocr text
        (pos, argres) = self.findinDoc('info.word.ocrText',0,-1)

@@ -644,9 +717,9 @@ class DocParser(object):
                    ptype = 'end'
                    first_para_continued = False
                (pclass, pdesc) = self.getParaDescription(start,end, regtype)
                tocinfo += self.buildTOCEntry(pdesc)
                htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)


            elif (regtype == 'vertical') or (regtype == 'table') :
                ptype = 'full'
                if inGroup:

@@ -704,12 +777,11 @@ class DocParser(object):
            htmlpage = htmlpage[0:-4]
            last_para_continued = False

        return htmlpage

        return htmlpage, tocinfo


def convert2HTML(flatxml, classlst, fileid, bookDir, gdict, fixedimage):
    # create a document parser
    dp = DocParser(flatxml, classlst, fileid, bookDir, gdict, fixedimage)
    htmlpage = dp.process()
    return htmlpage
    htmlpage, tocinfo = dp.process()
    return htmlpage, tocinfo

@@ -10,17 +10,94 @@ from struct import unpack


class PParser(object):
    def __init__(self, gd, flatxml):
    def __init__(self, gd, flatxml, meta_array):
        self.gd = gd
        self.flatdoc = flatxml.split('\n')
        self.docSize = len(self.flatdoc)
        self.temp = []
        foo = self.getData('page.h') or self.getData('book.h')
        self.ph = foo[0]
        foo = self.getData('page.w') or self.getData('book.w')
        self.pw = foo[0]
        self.gx = self.getData('info.glyph.x')
        self.gy = self.getData('info.glyph.y')
        self.gid = self.getData('info.glyph.glyphID')

        self.ph = -1
        self.pw = -1
        startpos = self.posinDoc('page.h') or self.posinDoc('book.h')
        for p in startpos:
            (name, argres) = self.lineinDoc(p)
            self.ph = max(self.ph, int(argres))
        startpos = self.posinDoc('page.w') or self.posinDoc('book.w')
        for p in startpos:
            (name, argres) = self.lineinDoc(p)
            self.pw = max(self.pw, int(argres))

        if self.ph <= 0:
            self.ph = int(meta_array.get('pageHeight', '11000'))
        if self.pw <= 0:
            self.pw = int(meta_array.get('pageWidth', '8500'))

        res = []
        startpos = self.posinDoc('info.glyph.x')
        for p in startpos:
            argres = self.getDataatPos('info.glyph.x', p)
            res.extend(argres)
        self.gx = res

        res = []
        startpos = self.posinDoc('info.glyph.y')
        for p in startpos:
            argres = self.getDataatPos('info.glyph.y', p)
            res.extend(argres)
        self.gy = res

        res = []
        startpos = self.posinDoc('info.glyph.glyphID')
        for p in startpos:
            argres = self.getDataatPos('info.glyph.glyphID', p)
            res.extend(argres)
        self.gid = res

    # return tag at line pos in document
    def lineinDoc(self, pos) :
        if (pos >= 0) and (pos < self.docSize) :
            item = self.flatdoc[pos]
            if item.find('=') >= 0:
                (name, argres) = item.split('=',1)
            else :
                name = item
                argres = ''
        return name, argres

    # find tag in doc if within pos to end inclusive
    def findinDoc(self, tagpath, pos, end) :
        result = None
        if end == -1 :
            end = self.docSize
        else:
            end = min(self.docSize, end)
        foundat = -1
        for j in xrange(pos, end):
            item = self.flatdoc[j]
            if item.find('=') >= 0:
                (name, argres) = item.split('=',1)
            else :
                name = item
                argres = ''
            if name.endswith(tagpath) :
                result = argres
                foundat = j
                break
        return foundat, result

    # return list of start positions for the tagpath
    def posinDoc(self, tagpath):
        startpos = []
        pos = 0
        res = ""
        while res != None :
            (foundpos, res) = self.findinDoc(tagpath, pos, -1)
            if res != None :
                startpos.append(foundpos)
            pos = foundpos + 1
        return startpos

    def getData(self, path):
        result = None
        cnt = len(self.flatdoc)
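All of these helpers assume the same flat document encoding: one tag per line, an optional value after '=', and multi-valued fields joined with '|'. A tiny worked example (tag values invented; getDataatPos is defined just below):

    flatdoc = ['page.h=11000',
               'info.glyph.x=100|250|400',
               'info.glyph.glyphID=7|7|9']

    # findinDoc('page.h', 0, -1)       ->  (0, '11000')
    # posinDoc('info.glyph.x')         ->  [1]
    # getDataatPos('info.glyph.x', 1)  ->  [100, 250, 400]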
@@ -39,6 +116,23 @@ class PParser(object):
            for j in xrange(0,len(argres)):
                argres[j] = int(argres[j])
        return result

    def getDataatPos(self, path, pos):
        result = None
        item = self.flatdoc[pos]
        if item.find('=') >= 0:
            (name, argt) = item.split('=')
            argres = argt.split('|')
        else:
            name = item
            argres = []
        if (len(argres) > 0) :
            for j in xrange(0,len(argres)):
                argres[j] = int(argres[j])
        if (name.endswith(path)):
            result = argres
        return result

    def getDataTemp(self, path):
        result = None
        cnt = len(self.temp)

@@ -58,6 +152,7 @@ class PParser(object):
            for j in xrange(0,len(argres)):
                argres[j] = int(argres[j])
        return result

    def getImages(self):
        result = []
        self.temp = self.flatdoc

@@ -69,6 +164,7 @@ class PParser(object):
            src = self.getDataTemp('img.src')[0]
            result.append('<image xlink:href="../img/img%04d.jpg" x="%d" y="%d" width="%d" height="%d" />\n' % (src, x, y, w, h))
        return result

    def getGlyphs(self):
        result = []
        if (self.gid != None) and (len(self.gid) > 0):
@@ -84,25 +180,25 @@ class PParser(object):
        return result


def convert2SVG(gdict, flat_xml, counter, numfiles, svgDir, raw, meta_array, scaledpi):
def convert2SVG(gdict, flat_xml, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi):
    ml = ''
    pp = PParser(gdict, flat_xml)
    pp = PParser(gdict, flat_xml, meta_array)
    ml += '<?xml version="1.0" standalone="no"?>\n'
    if (raw):
        ml += '<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n'
        ml += '<svg width="%fin" height="%fin" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">\n' % (pp.pw / scaledpi, pp.ph / scaledpi, pp.pw -1, pp.ph -1)
        ml += '<title>Page %d - %s by %s</title>\n' % (counter, meta_array['Title'],meta_array['Authors'])
        ml += '<title>Page %d - %s by %s</title>\n' % (pageid, meta_array['Title'],meta_array['Authors'])
    else:
        ml += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
        ml += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" ><head>\n'
        ml += '<title>Page %d - %s by %s</title>\n' % (counter, meta_array['Title'],meta_array['Authors'])
        ml += '<title>Page %d - %s by %s</title>\n' % (pageid, meta_array['Title'],meta_array['Authors'])
        ml += '<script><![CDATA[\n'
        ml += 'function gd(){var p=window.location.href.replace(/^.*\?dpi=(\d+).*$/i,"$1");return p;}\n'
        ml += 'var dpi=%d;\n' % scaledpi
        if (counter) :
            ml += 'var prevpage="page%04d.xhtml";\n' % (counter - 1)
        if (counter < numfiles-1) :
            ml += 'var nextpage="page%04d.xhtml";\n' % (counter + 1)
        if (previd) :
            ml += 'var prevpage="page%04d.xhtml";\n' % (previd)
        if (nextid) :
            ml += 'var nextpage="page%04d.xhtml";\n' % (nextid)
        ml += 'var pw=%d;var ph=%d;' % (pp.pw, pp.ph)
        ml += 'function zoomin(){dpi=dpi*(0.8);setsize();}\n'
        ml += 'function zoomout(){dpi=dpi*1.25;setsize();}\n'
@@ -115,10 +211,11 @@ def convert2SVG(gdict, flat_xml, counter, numfiles, svgDir, raw, meta_array, sca
    ml += '</head>\n'
    ml += '<body onLoad="setsize();" style="background-color:#777;text-align:center;">\n'
    ml += '<div style="white-space:nowrap;">\n'
    if (counter == 0) :
    if previd == None:
        ml += '<a href="javascript:ppage();"><svg id="prevsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"></svg></a>\n'
    else:
        ml += '<a href="javascript:ppage();"><svg id="prevsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"><polygon points="5,150,95,5,95,295" fill="#AAAAAA" /></svg></a>\n'

    ml += '<a href="javascript:npage();"><svg id="svgimg" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" style="background-color:#FFF;border:1px solid black;">' % (pp.pw, pp.ph)
    if (pp.gid != None):
        ml += '<defs>\n'

@@ -134,12 +231,14 @@ def convert2SVG(gdict, flat_xml, counter, numfiles, svgDir, raw, meta_array, sca
        for j in xrange(0,len(pp.gid)):
            ml += '<use xlink:href="#gl%d" x="%d" y="%d" />\n' % (pp.gid[j], pp.gx[j], pp.gy[j])
    if (img == None or len(img) == 0) and (pp.gid == None or len(pp.gid) == 0):
        ml += '<text x="10" y="10" font-family="Helvetica" font-size="100" stroke="black">This page intentionally left blank.</text>\n<text x="10" y="110" font-family="Helvetica" font-size="50" stroke="black">Until this notice unintentionally gave it content. (gensvg.py)</text>\n'
        xpos = "%d" % (pp.pw // 3)
        ypos = "%d" % (pp.ph // 3)
        ml += '<text x="' + xpos + '" y="' + ypos + '" font-size="' + meta_array['fontSize'] + '" font-family="Helvetica" stroke="black">This page intentionally left blank.</text>\n'
    if (raw) :
        ml += '</svg>'
    else :
        ml += '</svg></a>\n'
        if (counter == numfiles - 1) :
        if nextid == None:
            ml += '<a href="javascript:npage();"><svg id="nextsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"></svg></a>\n'
        else :
            ml += '<a href="javascript:npage();"><svg id="nextsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"><polygon points="5,5,5,295,95,150" fill="#AAAAAA" /></svg></a>\n'

@@ -19,6 +19,8 @@ import getopt
from struct import pack
from struct import unpack

class TpzDRMError(Exception):
    pass

# local support routines
if 'calibre' in sys.modules:

@@ -114,7 +116,8 @@ class Dictionary(object):
            return self.stable[self.pos]
        else:
            print "Error - %d outside of string table limits" % val
            sys.exit(-1)
            raise TpzDRMError('outside of string table limits')
            # sys.exit(-1)
    def getSize(self):
        return self.size
    def getPos(self):
@@ -371,10 +374,34 @@ def generateBook(bookDir, raw, fixedimage):
    (ph, pw) = getPageDim(flat_xml)
    if (ph == '-1') or (ph == '0') : ph = '11000'
    if (pw == '-1') or (pw == '0') : pw = '8500'
    meta_array['pageHeight'] = ph
    meta_array['pageWidth'] = pw
    if 'fontSize' not in meta_array.keys():
        meta_array['fontSize'] = fontsize

    # print ' ', 'other0000.dat'
    # process other.dat for css info and for map of page files to svg images
    # this map is needed because some pages actually are made up of multiple
    # pageXXXX.xml files
    xname = os.path.join(bookDir, 'style.css')
    flat_xml = convert2xml.fromData(dict, otherFile)

    # extract info.original.pid to get original page information
    pageIDMap = {}
    pageidnums = stylexml2css.getpageIDMap(flat_xml)
    if len(pageidnums) == 0:
        filenames = os.listdir(pageDir)
        numfiles = len(filenames)
        for k in range(numfiles):
            pageidnums.append(k)
    # create a map from page ids to list of page file nums to process for that page
    for i in range(len(pageidnums)):
        id = pageidnums[i]
        if id in pageIDMap.keys():
            pageIDMap[id].append(i)
        else:
            pageIDMap[id] = [i]

    # now get the css info
    cssstr , classlst = stylexml2css.convert2CSS(flat_xml, fontsize, ph, pw)
    file(xname, 'wb').write(cssstr)
    xname = os.path.join(xmlDir, 'other0000.xml')
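In other words, pageidnums maps page file index to original page id, and pageIDMap inverts that into page id -> list of file indices, so that several pageXXXX files can contribute to a single output page. For example:

    # pageidnums = [3, 4, 4, 5]      (files 1 and 2 both belong to page 4)
    # pageIDMap  = {3: [0], 4: [1, 2], 5: [3]}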
@@ -414,6 +441,9 @@ def generateBook(bookDir, raw, fixedimage):
    glyfile.close()
    print " "

    # build up tocentries while processing html
    tocentries = ''

    # start up the html
    htmlFileName = "book.html"
    htmlstr = '<?xml version="1.0" encoding="utf-8"?>\n'
@@ -436,6 +466,77 @@ def generateBook(bookDir, raw, fixedimage):
    # readability when rendering to the screen.
    scaledpi = 1440.0

    filenames = os.listdir(pageDir)
    filenames = sorted(filenames)
    numfiles = len(filenames)

    xmllst = []

    for filename in filenames:
        # print ' ', filename
        print ".",
        fname = os.path.join(pageDir,filename)
        flat_xml = convert2xml.fromData(dict, fname)

        # keep flat_xml for later svg processing
        xmllst.append(flat_xml)

        xname = os.path.join(xmlDir, filename.replace('.dat','.xml'))
        file(xname, 'wb').write(convert2xml.getXML(dict, fname))

        # first get the html
        pagehtml, tocinfo = flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir, gd, fixedimage)
        tocentries += tocinfo
        htmlstr += pagehtml

    # finish up the html string and output it
    htmlstr += '</body>\n</html>\n'
    file(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)

    print " "
    print 'Extracting Table of Contents from Amazon OCR'

    # first create a table of contents file for the svg images
    tochtml = '<?xml version="1.0" encoding="utf-8"?>\n'
    tochtml += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
    tochtml += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >'
    tochtml += '<head>\n'
    tochtml += '<title>' + meta_array['Title'] + '</title>\n'
    tochtml += '<meta name="Author" content="' + meta_array['Authors'] + '" />\n'
    tochtml += '<meta name="Title" content="' + meta_array['Title'] + '" />\n'
    if 'ASIN' in meta_array:
        tochtml += '<meta name="ASIN" content="' + meta_array['ASIN'] + '" />\n'
    if 'GUID' in meta_array:
        tochtml += '<meta name="GUID" content="' + meta_array['GUID'] + '" />\n'
    tochtml += '</head>\n'
    tochtml += '<body>\n'

    tochtml += '<h2>Table of Contents</h2>\n'
    start = pageidnums[0]
    if (raw):
        startname = 'page%04d.svg' % start
    else:
        startname = 'page%04d.xhtml' % start

    tochtml += '<h3><a href="' + startname + '">Start of Book</a></h3>\n'
    # build up a table of contents for the svg xhtml output
    toclst = tocentries.split('\n')
    toclst.pop()
    for entry in toclst:
        print entry
        title, pagenum = entry.split('|')
        id = pageidnums[int(pagenum)]
        if (raw):
            fname = 'page%04d.svg' % id
        else:
            fname = 'page%04d.xhtml' % id
        tochtml += '<h3><a href="'+ fname + '">' + title + '</a></h3>\n'
    tochtml += '</body>\n'
    tochtml += '</html>\n'
    file(os.path.join(svgDir, 'toc.xhtml'), 'wb').write(tochtml)


    # now create index_svg.xhtml that points to all required files
    svgindex = '<?xml version="1.0" encoding="utf-8"?>\n'
    svgindex += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
    svgindex += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >'
@@ -450,50 +551,42 @@ def generateBook(bookDir, raw, fixedimage):
    svgindex += '</head>\n'
    svgindex += '<body>\n'

    filenames = os.listdir(pageDir)
    filenames = sorted(filenames)
    numfiles = len(filenames)
    counter = 0

    for filename in filenames:
        # print ' ', filename
        print ".",

        fname = os.path.join(pageDir,filename)
        flat_xml = convert2xml.fromData(dict, fname)

        xname = os.path.join(xmlDir, filename.replace('.dat','.xml'))
        file(xname, 'wb').write(convert2xml.getXML(dict, fname))

        # first get the html
        htmlstr += flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir, gd, fixedimage)

        # now get the svg image of the page
        svgxml = flatxml2svg.convert2SVG(gd, flat_xml, counter, numfiles, svgDir, raw, meta_array, scaledpi)

    print "Building svg images of each book page"
    svgindex += '<h2>List of Pages</h2>\n'
    svgindex += '<div>\n'
    idlst = sorted(pageIDMap.keys())
    numids = len(idlst)
    cnt = len(idlst)
    previd = None
    for j in range(cnt):
        pageid = idlst[j]
        if j < cnt - 1:
            nextid = idlst[j+1]
        else:
            nextid = None
        print '.',
        pagelst = pageIDMap[pageid]
        flat_svg = ''
        for page in pagelst:
            flat_svg += xmllst[page]
        svgxml = flatxml2svg.convert2SVG(gd, flat_svg, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi)
        if (raw) :
            pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w')
            svgindex += '<a href="svg/page%04d.svg">Page %d</a>\n' % (counter, counter)
            pfile = open(os.path.join(svgDir,'page%04d.svg' % pageid),'w')
            svgindex += '<a href="svg/page%04d.svg">Page %d</a>\n' % (pageid, pageid)
        else :
            pfile = open(os.path.join(svgDir,'page%04d.xhtml' % counter), 'w')
            svgindex += '<a href="svg/page%04d.xhtml">Page %d</a>\n' % (counter, counter)


            pfile = open(os.path.join(svgDir,'page%04d.xhtml' % pageid), 'w')
            svgindex += '<a href="svg/page%04d.xhtml">Page %d</a>\n' % (pageid, pageid)
        previd = pageid
        pfile.write(svgxml)
        pfile.close()

        counter += 1

    print " "

    # finish up the html string and output it
    htmlstr += '</body>\n</html>\n'
    file(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)

    # finish up the svg index string and output it
    svgindex += '</div>\n'
    svgindex += '<h2><a href="svg/toc.xhtml">Table of Contents</a></h2>\n'
    svgindex += '</body>\n</html>\n'
    file(os.path.join(bookDir, 'index_svg.xhtml'), 'wb').write(svgindex)

print " "
|
||||
|
||||
# build the opf file
|
||||
opfname = os.path.join(bookDir, 'book.opf')
|
||||
opfstr = '<?xml version="1.0" encoding="utf-8"?>\n'
|
||||
@@ -573,7 +666,7 @@ def main(argv):
|
||||
return 1
|
||||
|
||||
raw = 0
|
||||
fixedimage = False
|
||||
fixedimage = True
|
||||
for o, a in opts:
|
||||
if o =="-h":
|
||||
usage()
|
||||
|
||||
@@ -17,7 +17,7 @@ from __future__ import with_statement
# and many many others


__version__ = '3.7'
__version__ = '3.9'

class Unbuffered:
    def __init__(self, stream):

@@ -32,6 +32,7 @@ import sys
import os, csv, getopt
import string
import re
import traceback

class DrmException(Exception):
    pass
@@ -95,8 +96,14 @@ def decryptBook(infile, outdir, k4, kInfoFiles, serials, pids):
    print "Processing Book: ", title
    filenametitle = cleanup_name(title)
    outfilename = bookname
    if len(bookname)>4 and len(filenametitle)>4 and bookname[:4] != filenametitle[:4]:
        if len(outfilename)<=8 or len(filenametitle)<=8:
            outfilename = outfilename + "_" + filenametitle
        elif outfilename[:8] != filenametitle[:8]:
            outfilename = outfilename[:8] + "_" + filenametitle

    # avoid excessively long file names
    if len(outfilename)>150:
        outfilename = outfilename[:150]

    # build pid list
    md1, md2 = mb.getPIDMetaInfo()
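A quick trace of the renaming rule above, with invented names:

    # bookname      = 'B004XYZ123'    (from the input file name)
    # filenametitle = 'A_Long_Title'  (cleaned-up metadata title)
    # first four characters differ and both names exceed 8 chars, so:
    #   outfilename = 'B004XYZ123'[:8] + '_' + 'A_Long_Title'
    #               = 'B004XYZ1_A_Long_Title'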
@@ -128,8 +135,8 @@ def decryptBook(infile, outdir, k4, kInfoFiles, serials, pids):
        zipname = os.path.join(outdir, outfilename + '_nodrm' + '.htmlz')
        mb.getHTMLZip(zipname)

        print " Creating SVG HTMLZ Archive"
        zipname = os.path.join(outdir, outfilename + '_SVG' + '.htmlz')
        print " Creating SVG ZIP Archive"
        zipname = os.path.join(outdir, outfilename + '_SVG' + '.zip')
        mb.getSVGZip(zipname)

        print " Creating XML ZIP Archive"

@@ -81,6 +81,14 @@ class DocParser(object):
            pos = foundpos + 1
        return startpos

    # returns a vector of integers for the tagpath
    def getData(self, tagpath, pos, end):
        argres=[]
        (foundat, argt) = self.findinDoc(tagpath, pos, end)
        if (argt != None) and (len(argt) > 0) :
            argList = argt.split('|')
            argres = [ int(strval) for strval in argList]
        return argres

    def process(self):

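getData is the vector counterpart of findinDoc: it looks up a single tag and parses its '|'-separated value into a list of integers, which is exactly what getpageIDMap below needs. For example (tag line invented):

    # flat xml line: 'info.original.pid=12|12|13'
    # dp.getData('info.original.pid', 0, -1)  ->  [12, 12, 13]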
@@ -237,7 +245,11 @@ def convert2CSS(flatxml, fontsize, ph, pw):

    # create a document parser
    dp = DocParser(flatxml, fontsize, ph, pw)

    csspage = dp.process()

    return csspage


def getpageIDMap(flatxml):
    dp = DocParser(flatxml, 0, 0, 0)
    pageidnumbers = dp.getData('info.original.pid', 0, -1)
    return pageidnumbers

@@ -140,6 +140,7 @@ class TopazBook:
    def __init__(self, filename):
        self.fo = file(filename, 'rb')
        self.outdir = tempfile.mkdtemp()
        # self.outdir = 'rawdat'
        self.bookPayloadOffset = 0
        self.bookHeaderRecords = {}
        self.bookMetadata = {}

@@ -370,7 +371,8 @@ class TopazBook:

    def cleanup(self):
        if os.path.isdir(self.outdir):
            shutil.rmtree(self.outdir, True)
            pass
            # shutil.rmtree(self.outdir, True)

def usage(progname):
    print "Removes DRM protection from Topaz ebooks and extracts the contents"
@@ -438,7 +440,7 @@ def main(argv=sys.argv):
        tb.getHTMLZip(zipname)

        print " Creating SVG ZIP Archive"
        zipname = os.path.join(outdir, bookname + '_SVG' + '.htmlz')
        zipname = os.path.join(outdir, bookname + '_SVG' + '.zip')
        tb.getSVGZip(zipname)

        print " Creating XML ZIP Archive"
@@ -450,12 +452,12 @@ def main(argv=sys.argv):

    except TpzDRMError, e:
        print str(e)
        tb.cleanup()
        # tb.cleanup()
        return 1

    except Exception, e:
        print str(e)
        tb.cleanup
        # tb.cleanup
        return 1

    return 0