tools v4.8

2011-10-28 07:24:15 +01:00
parent e95ed1a8ed
commit 93f02c625a
43 changed files with 1785 additions and 418 deletions
--- a/DeDRM_Windows_Application/DeDRM_WinApp/DeDRM_lib/lib/convert2xml.py
+++ b/DeDRM_Windows_Application/DeDRM_WinApp/DeDRM_lib/lib/convert2xml.py
@@ -20,6 +20,8 @@ import getopt
 from struct import pack
 from struct import unpack

+class TpzDRMError(Exception):
+    """Raised instead of calling sys.exit(-1) so that callers can handle the failure."""

 # Get a 7 bit encoded number from string. The most 
 # significant byte comes first and has the high bit (8th) set
@@ -138,7 +140,8 @@ class Dictionary(object):
            return self.stable[self.pos]
        else:
            print "Error - %d outside of string table limits" % val
-            sys.exit(-1)
+            raise TpzDRMError('outside of string table limits')
+            # sys.exit(-1)

    def getSize(self):
        return self.size
@@ -258,6 +261,11 @@ class PageParser(object):
        'paragraph.class'     : (1, 'scalar_text', 0, 0),
        'paragraph.firstWord' : (1, 'scalar_number', 0, 0),
        'paragraph.lastWord'  : (1, 'scalar_number', 0, 0),
+        # the grid tags below are declared exactly like firstWord/lastWord above
+        'paragraph.gridSize'  : (1, 'scalar_number', 0, 0),
+        'paragraph.gridBottomCenter'  : (1, 'scalar_number', 0, 0),
+        'paragraph.gridTopCenter' : (1, 'scalar_number', 0, 0),
+

        'word_semantic'           : (1, 'snippets', 1, 1),
        'word_semantic.type'      : (1, 'scalar_text', 0, 0),
@@ -272,11 +280,17 @@ class PageParser(object):

        '_span'           : (1, 'snippets', 1, 0),
        '_span.firstWord' : (1, 'scalar_number', 0, 0),
-        '-span.lastWord'  : (1, 'scalar_number', 0, 0),
+        '_span.lastWord'  : (1, 'scalar_number', 0, 0),
+        '_span.gridSize'  : (1, 'scalar_number', 0, 0),
+        '_span.gridBottomCenter'  : (1, 'scalar_number', 0, 0),
+        '_span.gridTopCenter' : (1, 'scalar_number', 0, 0),

        'span'           : (1, 'snippets', 1, 0),
        'span.firstWord' : (1, 'scalar_number', 0, 0),
        'span.lastWord'  : (1, 'scalar_number', 0, 0),
+        'span.gridSize'  : (1, 'scalar_number', 0, 0),
+        'span.gridBottomCenter'  : (1, 'scalar_number', 0, 0),
+        'span.gridTopCenter' : (1, 'scalar_number', 0, 0),

        'extratokens'            : (1, 'snippets', 1, 0),
        'extratokens.type'       : (1, 'scalar_text', 0, 0),
--- a/DeDRM_Windows_Application/DeDRM_WinApp/DeDRM_lib/lib/erdr2pml.py
+++ b/DeDRM_Windows_Application/DeDRM_WinApp/DeDRM_lib/lib/erdr2pml.py
@@ -59,8 +59,11 @@
 #  0.18 - on Windows try PyCrypto first and OpenSSL next
 #  0.19 - Modify the interface to allow use of import
 #  0.20 - modify to allow use inside new interface for calibre plugins
+#  0.21 - Support eReader (drm) version 11. 
+#       - Don't reject dictionary format. 
+#       - Ignore sidebars for dictionaries (different format?)

-__version__='0.20'
+__version__='0.21'

 class Unbuffered:
    def __init__(self, stream):
@@ -140,12 +143,18 @@ logging.basicConfig()


 class Sectionizer(object):
+    bkType = "Book"
+
    def __init__(self, filename, ident):
        self.contents = file(filename, 'rb').read()
        self.header = self.contents[0:72]
        self.num_sections, = struct.unpack('>H', self.contents[76:78])
+        # Dictionary or normal content (TODO: Not hard-coded)
        if self.header[0x3C:0x3C+8] != ident:
-            raise ValueError('Invalid file format')
+            if self.header[0x3C:0x3C+8] == "PDctPPrs":
+                self.bkType = "Dict"
+            else:
+                raise ValueError('Invalid file format')
        self.sections = []
        for i in xrange(self.num_sections):
            offset, a1,a2,a3,a4 = struct.unpack('>LBBBB', self.contents[78+i*8:78+i*8+8])
@@ -182,15 +191,15 @@ def deXOR(text, sp, table):
    return r

 class EreaderProcessor(object):
-    def __init__(self, section_reader, username, creditcard):
-        self.section_reader = section_reader
-        data = section_reader(0)
+    def __init__(self, sect, username, creditcard):
+        self.section_reader = sect.loadSection
+        data = self.section_reader(0)
        version,  = struct.unpack('>H', data[0:2])
        self.version = version
        logging.info('eReader file format version %s', version)
        if version != 272 and version != 260 and version != 259:
            raise ValueError('incorrect eReader version %d (error 1)' % version)
-        data = section_reader(1)
+        data = self.section_reader(1)
        self.data = data
        des = Des(fixKey(data[0:8]))
        cookie_shuf, cookie_size = struct.unpack('>LL', des.decrypt(data[-8:]))
@@ -219,11 +228,17 @@ class EreaderProcessor(object):
        self.num_text_pages = struct.unpack('>H', r[2:4])[0] - 1
        self.num_image_pages = struct.unpack('>H', r[26:26+2])[0]
        self.first_image_page = struct.unpack('>H', r[24:24+2])[0]
+        # Default values
+        self.num_footnote_pages = 0
+        self.num_sidebar_pages = 0
+        self.first_footnote_page = -1
+        self.first_sidebar_page = -1
        if self.version == 272:
            self.num_footnote_pages = struct.unpack('>H', r[46:46+2])[0]
            self.first_footnote_page = struct.unpack('>H', r[44:44+2])[0]
-            self.num_sidebar_pages = struct.unpack('>H', r[38:38+2])[0]
-            self.first_sidebar_page = struct.unpack('>H', r[36:36+2])[0]
+            if (sect.bkType == "Book"):
+                self.num_sidebar_pages = struct.unpack('>H', r[38:38+2])[0]
+                self.first_sidebar_page = struct.unpack('>H', r[36:36+2])[0]
            # self.num_bookinfo_pages = struct.unpack('>H', r[34:34+2])[0]
            # self.first_bookinfo_page = struct.unpack('>H', r[32:32+2])[0]
            # self.num_chapter_pages = struct.unpack('>H', r[22:22+2])[0]
@@ -239,10 +254,8 @@ class EreaderProcessor(object):
            self.xortable_size = struct.unpack('>H', r[42:42+2])[0]
            self.xortable = self.data[self.xortable_offset:self.xortable_offset + self.xortable_size]
        else:
-            self.num_footnote_pages = 0
-            self.num_sidebar_pages = 0
-            self.first_footnote_page = -1
-            self.first_sidebar_page = -1
+            # Nothing needs to be done
+            pass
            # self.num_bookinfo_pages = 0
            # self.num_chapter_pages = 0
            # self.num_link_pages = 0
@@ -267,10 +280,14 @@ class EreaderProcessor(object):
            encrypted_key_sha = r[44:44+20]
            encrypted_key = r[64:64+8]
        elif version == 260:
-            if drm_sub_version != 13:
+            if drm_sub_version != 13 and drm_sub_version != 11:
                raise ValueError('incorrect eReader version %d (error 3)' % drm_sub_version)
-            encrypted_key = r[44:44+8]
-            encrypted_key_sha = r[52:52+20]
+            if drm_sub_version == 13:
+                encrypted_key = r[44:44+8]
+                encrypted_key_sha = r[52:52+20]
+            else:
+                encrypted_key = r[64:64+8]
+                encrypted_key_sha = r[44:44+20]
        elif version == 272:
            encrypted_key = r[172:172+8]
            encrypted_key_sha = r[56:56+20]
@@ -356,6 +373,12 @@ class EreaderProcessor(object):
                r += fmarker
                fnote_ids = fnote_ids[id_len+4:]

+        # TODO: Handle dictionary index (?) pages - which are also marked as
+        # sidebar_pages (?). For now dictionary sidebars are ignored
+        # For dictionaries - record 0 is null terminated strings, followed by
+        # blocks of around 62000 bytes and a final block. Not sure of the
+        # encoding
+
        # now handle sidebar pages
        if self.num_sidebar_pages > 0:
            r += '\n'
@@ -368,7 +391,7 @@ class EreaderProcessor(object):
                id_len = ord(sbar_ids[2])
                id = sbar_ids[3:3+id_len]
                smarker = '<sidebar id="%s">\n' % id
-                smarker += zlib.decompress(des.decrypt(self.section_reader(self.first_footnote_page + i)))
+                smarker += zlib.decompress(des.decrypt(self.section_reader(self.first_sidebar_page + i)))
                smarker += '\n</sidebar>\n'
                r += smarker
                sbar_ids = sbar_ids[id_len+4:]
@@ -389,7 +412,7 @@ def convertEreaderToPml(infile, name, cc, outdir):
    bookname = os.path.splitext(os.path.basename(infile))[0]
    print "   Decoding File"
    sect = Sectionizer(infile, 'PNRdPPrs')
-    er = EreaderProcessor(sect.loadSection, name, cc)
+    er = EreaderProcessor(sect, name, cc)

    if er.getNumImages() > 0:
        print "   Extracting images"
--- a/DeDRM_Windows_Application/DeDRM_WinApp/DeDRM_lib/lib/flatxml2html.py
+++ b/DeDRM_Windows_Application/DeDRM_WinApp/DeDRM_lib/lib/flatxml2html.py
@@ -271,6 +271,9 @@ class DocParser(object):

        pclass = self.getClass(pclass)

+        # if paragraph uses extratokens (extra glyphs) then make it fixed
+        (pos, extraglyphs) = self.findinDoc('paragraph.extratokens',start,end)
+
        # build up a description of the paragraph in result and return it
        # first check for the  basic - all words paragraph
        (pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
@@ -280,6 +283,7 @@ class DocParser(object):
            last = int(slast)
            
            makeImage = (regtype == 'vertical') or (regtype == 'table')
+            makeImage = makeImage or (extraglyphs != None) 
            if self.fixedimage:
                makeImage = makeImage or (regtype == 'fixed')

@@ -353,6 +357,8 @@ class DocParser(object):

        word_class = ''

+        word_semantic_type = ''
+
        while (line < end) :

            (name, argres) = self.lineinDoc(line)
@@ -512,6 +518,72 @@ class DocParser(object):
        return parares


+    # collect the table of contents entries found in one paragraph description
+    # returns a string holding one 'title|pagenum' line per usable internal link
+    def buildTOCEntry(self, pdesc) :
+        parares = ''
+        sep =''
+        tocentry = ''
+        handle_links = len(self.link_id) > 0
+        # lstart: offset in parares where the text of the next entry starts
+        lstart = 0
+
+        cnt = len(pdesc)
+        for j in xrange( 0, cnt) :
+
+            (wtype, num) = pdesc[j]
+
+            if wtype == 'ocr' :
+                word = self.ocrtext[num]
+                sep = ' '
+                # a word with a link id > 0 yields a toc entry (if title and page are set)
+                if handle_links:
+                    link = self.link_id[num]
+                    if (link > 0):
+                        linktype = self.link_type[link-1]
+                        title = self.link_title[link-1]
+                        title = title.rstrip('. ')
+                        alt_title = parares[lstart:]
+                        alt_title = alt_title.strip()
+                        # now strip off the actual printed page number
+                        alt_title = alt_title.rstrip('01234567890ivxldIVXLD-.')
+                        alt_title = alt_title.rstrip('. ')
+                        # skip over any external links - can't have them in a books toc
+                        if linktype == 'external' :
+                            title = ''
+                            alt_title = ''
+                            linkpage = ''
+                        else : 
+                            if len(self.link_page) >= link :
+                                ptarget = self.link_page[link-1] - 1
+                                linkpage = '%04d' % ptarget
+                            else :
+                                # just link to the current page
+                                linkpage = self.id[4:]
+                        if len(alt_title) >= len(title):
+                            title = alt_title
+                        if title != '' and linkpage != '':
+                            tocentry += title + '|' + linkpage + '\n'
+                        lstart = len(parares)
+                        if word == '_link_' : word = ''
+                    elif (link < 0) :
+                        if word == '_link_' : word = ''
+                # '_lb_' markers contribute no text and no separator
+                if word == '_lb_':
+                    word = ''
+                    sep = ''
+                # words listed in dehyphen_rootid lose their last character and get no separator
+                if num in self.dehyphen_rootid :
+                    word = word[0:-1]
+                    sep = ''
+
+                parares += word + sep
+
+            else :
+                continue
+
+        return tocentry
+
    
    # walk the document tree collecting the information needed
    # to build an html page using the ocrText
@@ -519,6 +591,7 @@ class DocParser(object):
    def process(self):

        htmlpage = ''
+        tocinfo = ''

        # get the ocr text
        (pos, argres) = self.findinDoc('info.word.ocrText',0,-1)
@@ -644,9 +717,9 @@ class DocParser(object):
                        ptype = 'end'
                        first_para_continued = False
                    (pclass, pdesc) = self.getParaDescription(start,end, regtype)
+                    tocinfo += self.buildTOCEntry(pdesc)
                    htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)

-
                elif (regtype == 'vertical') or (regtype == 'table') :
                    ptype = 'full'
                    if inGroup:
@@ -704,12 +777,11 @@ class DocParser(object):
                htmlpage = htmlpage[0:-4]
            last_para_continued = False

-        return htmlpage
-
+        return htmlpage, tocinfo


 def convert2HTML(flatxml, classlst, fileid, bookDir, gdict, fixedimage):
    # create a document parser
    dp = DocParser(flatxml, classlst, fileid, bookDir, gdict, fixedimage)
-    htmlpage = dp.process()
-    return htmlpage
+    htmlpage, tocinfo = dp.process()
+    return htmlpage, tocinfo
--- a/DeDRM_Windows_Application/DeDRM_WinApp/DeDRM_lib/lib/flatxml2svg.py
+++ b/DeDRM_Windows_Application/DeDRM_WinApp/DeDRM_lib/lib/flatxml2svg.py
@@ -10,17 +10,94 @@ from struct import unpack


 class PParser(object):
-    def __init__(self, gd, flatxml):
+    def __init__(self, gd, flatxml, meta_array):
        self.gd = gd
        self.flatdoc = flatxml.split('\n')
+        self.docSize = len(self.flatdoc)
        self.temp = []
-        foo = self.getData('page.h') or self.getData('book.h')
-        self.ph = foo[0]
-        foo = self.getData('page.w') or self.getData('book.w')
-        self.pw = foo[0]
-        self.gx = self.getData('info.glyph.x')
-        self.gy = self.getData('info.glyph.y')
-        self.gid = self.getData('info.glyph.glyphID')
+        
+        self.ph = -1
+        self.pw = -1
+        startpos = self.posinDoc('page.h') or self.posinDoc('book.h')
+        for p in startpos:
+            (name, argres) = self.lineinDoc(p)
+            self.ph = max(self.ph, int(argres))
+        startpos = self.posinDoc('page.w') or self.posinDoc('book.w')
+        for p in startpos:
+            (name, argres) = self.lineinDoc(p)
+            self.pw = max(self.pw, int(argres))
+        
+        if self.ph <= 0:
+            self.ph = int(meta_array.get('pageHeight', '11000'))
+        if self.pw <= 0:
+            self.pw = int(meta_array.get('pageWidth', '8500'))
+
+        res = []
+        startpos = self.posinDoc('info.glyph.x')
+        for p in startpos:
+            argres = self.getDataatPos('info.glyph.x', p)
+            res.extend(argres)
+        self.gx = res
+
+        res = []
+        startpos = self.posinDoc('info.glyph.y')
+        for p in startpos:
+            argres = self.getDataatPos('info.glyph.y', p)
+            res.extend(argres)
+        self.gy = res
+
+        res = []
+        startpos = self.posinDoc('info.glyph.glyphID')
+        for p in startpos:
+            argres = self.getDataatPos('info.glyph.glyphID', p)
+            res.extend(argres)
+        self.gid = res
+
+
+    # return (name, value) of line pos, split at the first '='; pos must lie within the doc
+    def lineinDoc(self, pos) :
+        if (pos >= 0) and (pos < self.docSize) :
+            item = self.flatdoc[pos]
+            if item.find('=') >= 0:
+                (name, argres) = item.split('=',1)
+            else :
+                name = item
+                argres = ''
+        return name, argres
+
+    # find first tag ending in tagpath within lines pos..end-1 (end == -1 means to end of doc)
+    def findinDoc(self, tagpath, pos, end) :
+        result = None
+        if end == -1 :
+            end = self.docSize
+        else:
+            end = min(self.docSize, end)
+        foundat = -1
+        for j in xrange(pos, end):
+            item = self.flatdoc[j]
+            if item.find('=') >= 0:
+                (name, argres) = item.split('=',1)
+            else :
+                name = item
+                argres = ''
+            if name.endswith(tagpath) :
+                result = argres
+                foundat = j
+                break
+        return foundat, result
+
+    # collect the line numbers of all tags in the doc that end with tagpath
+    def posinDoc(self, tagpath):
+        positions = []
+        nextpos = 0
+        while True :
+            (foundat, value) = self.findinDoc(tagpath, nextpos, -1)
+            if value == None :
+                break
+            positions.append(foundat)
+            nextpos = foundat + 1
+        return positions
+
    def getData(self, path):
        result = None
        cnt = len(self.flatdoc)
@@ -39,6 +116,23 @@ class PParser(object):
            for j in xrange(0,len(argres)):
                argres[j] = int(argres[j])
        return result
+
+    # return the integers stored on line pos of the doc when its tag ends
+    # with path, or None when that line holds some other tag
+    def getDataatPos(self, path, pos):
+        result = None
+        item = self.flatdoc[pos]
+        if item.find('=') >= 0:
+            (name, argt) = item.split('=',1)
+            argres = argt.split('|')
+        else:
+            name = item
+            argres = []
+        # convert after the tag check so non-matching lines never hit int()
+        if (name.endswith(path)):
+            result = [int(arg) for arg in argres]
+        return result
+
    def getDataTemp(self, path):
        result = None
        cnt = len(self.temp)
@@ -58,6 +152,7 @@ class PParser(object):
            for j in xrange(0,len(argres)):
                argres[j] = int(argres[j])
        return result
+
    def getImages(self):
        result = []
        self.temp = self.flatdoc
@@ -69,6 +164,7 @@ class PParser(object):
            src = self.getDataTemp('img.src')[0]
            result.append('<image xlink:href="../img/img%04d.jpg" x="%d" y="%d" width="%d" height="%d" />\n' % (src, x, y, w, h))
        return result
+
    def getGlyphs(self):
        result = []
        if (self.gid != None) and (len(self.gid) > 0):
@@ -84,25 +180,25 @@ class PParser(object):
        return result


-def convert2SVG(gdict, flat_xml, counter, numfiles, svgDir, raw, meta_array, scaledpi):
+def convert2SVG(gdict, flat_xml, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi):
    ml = ''
-    pp = PParser(gdict, flat_xml)
+    pp = PParser(gdict, flat_xml, meta_array)
    ml += '<?xml version="1.0" standalone="no"?>\n'
    if (raw):
        ml += '<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n'
        ml += '<svg width="%fin" height="%fin" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">\n' % (pp.pw / scaledpi, pp.ph / scaledpi, pp.pw -1, pp.ph -1)
-        ml += '<title>Page %d - %s by %s</title>\n' % (counter, meta_array['Title'],meta_array['Authors'])
+        ml += '<title>Page %d - %s by %s</title>\n' % (pageid, meta_array['Title'],meta_array['Authors'])
    else:
        ml += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
        ml += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" ><head>\n'
-        ml += '<title>Page %d - %s by %s</title>\n' % (counter, meta_array['Title'],meta_array['Authors'])
+        ml += '<title>Page %d - %s by %s</title>\n' % (pageid, meta_array['Title'],meta_array['Authors'])
        ml += '<script><![CDATA[\n'
        ml += 'function gd(){var p=window.location.href.replace(/^.*\?dpi=(\d+).*$/i,"$1");return p;}\n'
        ml += 'var dpi=%d;\n' % scaledpi
-        if (counter) :
-            ml += 'var prevpage="page%04d.xhtml";\n' % (counter - 1)
-        if (counter < numfiles-1) :
-            ml += 'var nextpage="page%04d.xhtml";\n' % (counter + 1)
+        if previd is not None:  # a page id of 0 is a valid link target
+            ml += 'var prevpage="page%04d.xhtml";\n' % (previd)
+        if nextid is not None:  # None means there is no following page
+            ml += 'var nextpage="page%04d.xhtml";\n' % (nextid)
        ml += 'var pw=%d;var ph=%d;' % (pp.pw, pp.ph)
        ml += 'function zoomin(){dpi=dpi*(0.8);setsize();}\n'
        ml += 'function zoomout(){dpi=dpi*1.25;setsize();}\n'
@@ -115,10 +211,11 @@ def convert2SVG(gdict, flat_xml, counter, numfiles, svgDir, raw, meta_array, sca
        ml += '</head>\n'
        ml += '<body onLoad="setsize();" style="background-color:#777;text-align:center;">\n'
        ml += '<div style="white-space:nowrap;">\n'
-        if (counter == 0) :
+        if previd == None:
            ml += '<a href="javascript:ppage();"><svg id="prevsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"></svg></a>\n'
        else:
            ml += '<a href="javascript:ppage();"><svg id="prevsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"><polygon points="5,150,95,5,95,295" fill="#AAAAAA" /></svg></a>\n'
+        
        ml += '<a href="javascript:npage();"><svg id="svgimg" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" style="background-color:#FFF;border:1px solid black;">' % (pp.pw, pp.ph)
    if (pp.gid != None): 
        ml += '<defs>\n'
@@ -134,12 +231,14 @@ def convert2SVG(gdict, flat_xml, counter, numfiles, svgDir, raw, meta_array, sca
        for j in xrange(0,len(pp.gid)):
            ml += '<use xlink:href="#gl%d" x="%d" y="%d" />\n' % (pp.gid[j], pp.gx[j], pp.gy[j])
    if (img == None or len(img) == 0) and (pp.gid == None or len(pp.gid) == 0):
-        ml += '<text x="10" y="10" font-family="Helvetica" font-size="100" stroke="black">This page intentionally left blank.</text>\n<text x="10" y="110" font-family="Helvetica" font-size="50" stroke="black">Until this notice unintentionally gave it content.  (gensvg.py)</text>\n'
+        xpos = "%d" % (pp.pw // 3)
+        ypos = "%d" % (pp.ph // 3)
+        ml += '<text x="' + xpos + '" y="' + ypos + '" font-size="' + meta_array['fontSize'] + '" font-family="Helvetica" stroke="black">This page intentionally left blank.</text>\n'
    if (raw) :
        ml += '</svg>'
    else :
        ml += '</svg></a>\n'
-        if (counter == numfiles - 1) :
+        if nextid == None:
            ml += '<a href="javascript:npage();"><svg id="nextsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"></svg></a>\n'
        else :
            ml += '<a href="javascript:npage();"><svg id="nextsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"><polygon points="5,5,5,295,95,150" fill="#AAAAAA" /></svg></a>\n'
--- a/DeDRM_Windows_Application/DeDRM_WinApp/DeDRM_lib/lib/genbook.py
+++ b/DeDRM_Windows_Application/DeDRM_WinApp/DeDRM_lib/lib/genbook.py
@@ -19,6 +19,8 @@ import getopt
 from struct import pack
 from struct import unpack

+class TpzDRMError(Exception):
+    """Raised instead of calling sys.exit(-1) so that callers can handle the failure."""

 # local support routines
 if 'calibre' in sys.modules:
@@ -114,7 +116,8 @@ class Dictionary(object):
            return self.stable[self.pos]
        else:
            print "Error - %d outside of string table limits" % val
-            sys.exit(-1)
+            raise TpzDRMError('outside of string table limits')
+            # sys.exit(-1)
    def getSize(self):
        return self.size
    def getPos(self):
@@ -371,10 +374,34 @@ def generateBook(bookDir, raw, fixedimage):
    (ph, pw) = getPageDim(flat_xml)
    if (ph == '-1') or (ph == '0') : ph = '11000'
    if (pw == '-1') or (pw == '0') : pw = '8500'
+    meta_array['pageHeight'] = ph
+    meta_array['pageWidth'] = pw
+    if 'fontSize' not in meta_array.keys():
+        meta_array['fontSize'] = fontsize

-    # print '     ', 'other0000.dat'
+    # process other.dat for css info and for map of page files to svg images
+    # this map is needed because some pages actually are made up of multiple
+    # pageXXXX.xml files
    xname = os.path.join(bookDir, 'style.css')
    flat_xml = convert2xml.fromData(dict, otherFile)
+
+    # extract info.original.pid to get original page information
+    pageIDMap = {}
+    pageidnums = stylexml2css.getpageIDMap(flat_xml)
+    if len(pageidnums) == 0:
+        filenames = os.listdir(pageDir)
+        numfiles = len(filenames)
+        for k in range(numfiles):
+            pageidnums.append(k)
+    # create a map from page ids to list of page file nums to process for that page
+    # (a page made up of several page files collects all of their numbers)
+    for i, id in enumerate(pageidnums):
+        if id in pageIDMap:
+            pageIDMap[id].append(i)
+        else:
+            pageIDMap[id] = [i]
+
+    # now get the css info
    cssstr , classlst = stylexml2css.convert2CSS(flat_xml, fontsize, ph, pw)
    file(xname, 'wb').write(cssstr)
    xname = os.path.join(xmlDir, 'other0000.xml')
@@ -414,6 +441,9 @@ def generateBook(bookDir, raw, fixedimage):
    glyfile.close()
    print " "

+    # build up tocentries while processing html
+    tocentries = ''
+
    # start up the html
    htmlFileName = "book.html"
    htmlstr = '<?xml version="1.0" encoding="utf-8"?>\n'
@@ -436,6 +466,77 @@ def generateBook(bookDir, raw, fixedimage):
    # readability when rendering to the screen.  
    scaledpi = 1440.0

+    filenames = os.listdir(pageDir)
+    filenames = sorted(filenames)
+    numfiles = len(filenames)
+
+    xmllst = []
+
+    for filename in filenames:
+        # print '     ', filename
+        print ".",
+        fname = os.path.join(pageDir,filename)
+        flat_xml = convert2xml.fromData(dict, fname)
+
+        # keep flat_xml for later svg processing
+        xmllst.append(flat_xml)
+
+        xname = os.path.join(xmlDir, filename.replace('.dat','.xml'))
+        file(xname, 'wb').write(convert2xml.getXML(dict, fname))
+
+        # first get the html
+        pagehtml, tocinfo = flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir, gd, fixedimage)
+        tocentries += tocinfo 
+        htmlstr += pagehtml
+
+    # finish up the html string and output it
+    htmlstr += '</body>\n</html>\n'
+    file(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)
+    
+    print " "
+    print 'Extracting Table of Contents from Amazon OCR'
+
+    # first create a table of contents file for the svg images
+    tochtml = '<?xml version="1.0" encoding="utf-8"?>\n'
+    tochtml += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
+    tochtml += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >'
+    tochtml += '<head>\n'
+    tochtml += '<title>' + meta_array['Title'] + '</title>\n'
+    tochtml += '<meta name="Author" content="' + meta_array['Authors'] + '" />\n'
+    tochtml += '<meta name="Title" content="' + meta_array['Title'] + '" />\n'
+    if 'ASIN' in meta_array:
+        tochtml += '<meta name="ASIN" content="' + meta_array['ASIN'] + '" />\n'
+    if 'GUID' in meta_array:
+        tochtml += '<meta name="GUID" content="' + meta_array['GUID'] + '" />\n'
+    tochtml += '</head>\n'
+    tochtml += '<body>\n'
+
+    tochtml += '<h2>Table of Contents</h2>\n'
+    start = pageidnums[0]
+    if (raw):
+        startname = 'page%04d.svg' % start
+    else:
+        startname = 'page%04d.xhtml' % start
+
+    tochtml += '<h3><a href="' + startname + '">Start of Book</a></h3>\n'
+    # build up a table of contents for the svg xhtml output from the 'title|pagenum' lines
+    toclst = tocentries.split('\n')
+    toclst.pop()
+    for entry in toclst:
+        print entry
+        title, pagenum = entry.rsplit('|', 1)  # the title may itself contain '|'
+        id = pageidnums[int(pagenum)]
+        if (raw):
+            fname = 'page%04d.svg' % id
+        else:
+            fname = 'page%04d.xhtml' % id
+        tochtml += '<h3><a href="'+ fname + '">' + title + '</a></h3>\n'
+    tochtml += '</body>\n'
+    tochtml += '</html>\n'
+    file(os.path.join(svgDir, 'toc.xhtml'), 'wb').write(tochtml)
+
+
+    # now create index_svg.xhtml that points to all required files
    svgindex = '<?xml version="1.0" encoding="utf-8"?>\n'
    svgindex += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
    svgindex += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >'
@@ -450,50 +551,42 @@ def generateBook(bookDir, raw, fixedimage):
    svgindex += '</head>\n'
    svgindex += '<body>\n'

-    filenames = os.listdir(pageDir)
-    filenames = sorted(filenames)
-    numfiles = len(filenames)
-    counter = 0
-
-    for filename in filenames:
-        # print '     ', filename
-        print ".",
-
-        fname = os.path.join(pageDir,filename)
-        flat_xml = convert2xml.fromData(dict, fname)
-
-        xname = os.path.join(xmlDir, filename.replace('.dat','.xml'))
-        file(xname, 'wb').write(convert2xml.getXML(dict, fname))
-
-        # first get the html
-        htmlstr += flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir, gd, fixedimage)
-
-        # now get the svg image of the page
-        svgxml = flatxml2svg.convert2SVG(gd, flat_xml, counter, numfiles, svgDir, raw, meta_array, scaledpi)
-
+    print "Building svg images of each book page"
+    svgindex += '<h2>List of Pages</h2>\n'
+    svgindex += '<div>\n'
+    idlst = sorted(pageIDMap.keys())
+    numids = len(idlst)
+    cnt = len(idlst)
+    previd = None
+    for j in range(cnt):
+        pageid = idlst[j]
+        if j < cnt - 1:
+            nextid = idlst[j+1]
+        else:
+            nextid = None
+        print '.',
+        pagelst = pageIDMap[pageid]
+        flat_svg = ''
+        for page in pagelst:
+            flat_svg += xmllst[page]
+        svgxml = flatxml2svg.convert2SVG(gd, flat_svg, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi)
        if (raw) :
-            pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w')
-            svgindex += '<a href="svg/page%04d.svg">Page %d</a>\n' % (counter, counter)
+            pfile = open(os.path.join(svgDir,'page%04d.svg' % pageid),'w')
+            svgindex += '<a href="svg/page%04d.svg">Page %d</a>\n' % (pageid, pageid)
        else :
-            pfile = open(os.path.join(svgDir,'page%04d.xhtml' % counter), 'w')
-            svgindex += '<a href="svg/page%04d.xhtml">Page %d</a>\n' % (counter, counter)
-
-
+            pfile = open(os.path.join(svgDir,'page%04d.xhtml' % pageid), 'w')
+            svgindex += '<a href="svg/page%04d.xhtml">Page %d</a>\n' % (pageid, pageid)
+        previd = pageid
        pfile.write(svgxml)
        pfile.close()
-
        counter += 1
-
-    print " "
-
-    # finish up the html string and output it
-    htmlstr += '</body>\n</html>\n'
-    file(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)
-
-    # finish up the svg index string and output it
+    svgindex += '</div>\n'
+    svgindex += '<h2><a href="svg/toc.xhtml">Table of Contents</a></h2>\n'
    svgindex += '</body>\n</html>\n'
    file(os.path.join(bookDir, 'index_svg.xhtml'), 'wb').write(svgindex)

+    print " "
+
    # build the opf file
    opfname = os.path.join(bookDir, 'book.opf')
    opfstr = '<?xml version="1.0" encoding="utf-8"?>\n'
@@ -573,7 +666,7 @@ def main(argv):
        return 1 

    raw = 0
-    fixedimage = False
+    fixedimage = True
    for o, a in opts:
        if o =="-h":
            usage()
--- a/DeDRM_Windows_Application/DeDRM_WinApp/DeDRM_lib/lib/k4mobidedrm.py
+++ b/DeDRM_Windows_Application/DeDRM_WinApp/DeDRM_lib/lib/k4mobidedrm.py
@@ -17,7 +17,7 @@ from __future__ import with_statement
 #    and many many others


-__version__ = '3.7'
+__version__ = '3.9'

 class Unbuffered:
    def __init__(self, stream):
@@ -32,6 +32,7 @@ import sys
 import os, csv, getopt
 import string
 import re
+import traceback

 class DrmException(Exception):
    pass
@@ -95,8 +96,14 @@ def decryptBook(infile, outdir, k4, kInfoFiles, serials, pids):
    print "Processing Book: ", title
    filenametitle = cleanup_name(title)
    outfilename = bookname
-    if len(bookname)>4 and len(filenametitle)>4 and bookname[:4] != filenametitle[:4]:
+    if len(outfilename)<=8 or len(filenametitle)<=8:
        outfilename = outfilename + "_" + filenametitle
+    elif outfilename[:8] != filenametitle[:8]:
+        outfilename = outfilename[:8] + "_" + filenametitle
+        
+    # avoid excessively long file names
+    if len(outfilename)>150:
+        outfilename = outfilename[:150]

    # build pid list
    md1, md2 = mb.getPIDMetaInfo()
@@ -128,8 +135,8 @@ def decryptBook(infile, outdir, k4, kInfoFiles, serials, pids):
    zipname = os.path.join(outdir, outfilename + '_nodrm' + '.htmlz')
    mb.getHTMLZip(zipname)

-    print "   Creating SVG HTMLZ Archive"
-    zipname = os.path.join(outdir, outfilename + '_SVG' + '.htmlz')
+    print "   Creating SVG ZIP Archive"
+    zipname = os.path.join(outdir, outfilename + '_SVG' + '.zip')
    mb.getSVGZip(zipname)

    print "   Creating XML ZIP Archive"
--- a/DeDRM_Windows_Application/DeDRM_WinApp/DeDRM_lib/lib/stylexml2css.py
+++ b/DeDRM_Windows_Application/DeDRM_WinApp/DeDRM_lib/lib/stylexml2css.py
@@ -81,6 +81,14 @@ class DocParser(object):
            pos = foundpos + 1
        return startpos

+    # returns the '|' separated integers stored for the tagpath, [] if none are found
+    def getData(self, tagpath, pos, end):
+        argres=[]
+        (foundat, argt) = self.findinDoc(tagpath, pos, end)
+        if (argt != None) and (len(argt) > 0) :
+            argList = argt.split('|')
+            argres = [ int(strval) for strval in argList]
+        return argres

    def process(self):

@@ -237,7 +245,11 @@ def convert2CSS(flatxml, fontsize, ph, pw):

    # create a document parser
    dp = DocParser(flatxml, fontsize, ph, pw)
-
    csspage = dp.process()
-
    return csspage
+
+# return the integers stored under info.original.pid in flatxml ([] if absent)
+def getpageIDMap(flatxml):
+    dp = DocParser(flatxml, 0, 0, 0)
+    pageidnumbers = dp.getData('info.original.pid', 0, -1)
+    return pageidnumbers
--- a/DeDRM_Windows_Application/DeDRM_WinApp/DeDRM_lib/lib/topazextract.py
+++ b/DeDRM_Windows_Application/DeDRM_WinApp/DeDRM_lib/lib/topazextract.py
@@ -140,6 +140,7 @@ class TopazBook:
    def __init__(self, filename):
        self.fo = file(filename, 'rb')
        self.outdir = tempfile.mkdtemp()
+        # self.outdir = 'rawdat'
        self.bookPayloadOffset = 0
        self.bookHeaderRecords = {}
        self.bookMetadata = {}
@@ -370,7 +371,8 @@ class TopazBook:

    def cleanup(self):
        if os.path.isdir(self.outdir):
-            shutil.rmtree(self.outdir, True)
+            pass
+            # shutil.rmtree(self.outdir, True)

 def usage(progname):
    print "Removes DRM protection from Topaz ebooks and extract the contents"
@@ -438,7 +440,7 @@ def main(argv=sys.argv):
        tb.getHTMLZip(zipname)

        print "   Creating SVG ZIP Archive"
-        zipname = os.path.join(outdir, bookname + '_SVG' + '.htmlz')
+        zipname = os.path.join(outdir, bookname + '_SVG' + '.zip')
        tb.getSVGZip(zipname)

        print "   Creating XML ZIP Archive"
@@ -450,12 +452,12 @@ def main(argv=sys.argv):

    except TpzDRMError, e:
        print str(e)
-        tb.cleanup()
+        # tb.cleanup()
        return 1

    except Exception, e:
        print str(e)
-        tb.cleanup
+        # tb.cleanup
        return 1

    return 0