More fixes for Amazon books, fixing identity checks, started on Topaz.

2020-10-16 13:58:59 +01:00
parent dc27c36761
commit 939cdbb0c9
8 changed files with 530 additions and 512 deletions
--- a/DeDRM_plugin/flatxml2html.py
+++ b/DeDRM_plugin/flatxml2html.py
@@ -7,6 +7,7 @@ import csv
 import os
 import math
 import getopt
+import functools
 from struct import pack
 from struct import unpack

@@ -15,14 +16,14 @@ class DocParser(object):
    def __init__(self, flatxml, classlst, fileid, bookDir, gdict, fixedimage):
        self.id = os.path.basename(fileid).replace('.dat','')
        self.svgcount = 0
-        self.docList = flatxml.split('\n')
+        self.docList = flatxml.split(b'\n')
        self.docSize = len(self.docList)
        self.classList = {}
        self.bookDir = bookDir
        self.gdict = gdict
        tmpList = classlst.split('\n')
        for pclass in tmpList:
-            if pclass != '':
+            if pclass != b'':
                # remove the leading period from the css name
                cname = pclass[1:]
            self.classList[cname] = True
@@ -57,9 +58,9 @@ class DocParser(object):
        imgfile = os.path.join(imgDir,imgname)

        # get glyph information
-        gxList = self.getData('info.glyph.x',0,-1)
-        gyList = self.getData('info.glyph.y',0,-1)
-        gidList = self.getData('info.glyph.glyphID',0,-1)
+        gxList = self.getData(b'info.glyph.x',0,-1)
+        gyList = self.getData(b'info.glyph.y',0,-1)
+        gidList = self.getData(b'info.glyph.glyphID',0,-1)

        gids = []
        maxws = []
@@ -122,11 +123,11 @@ class DocParser(object):
    def lineinDoc(self, pos) :
        if (pos >= 0) and (pos < self.docSize) :
            item = self.docList[pos]
-            if item.find('=') >= 0:
-                (name, argres) = item.split('=',1)
+            if item.find(b'=') >= 0:
+                (name, argres) = item.split(b'=',1)
            else :
                name = item
-                argres = ''
+                argres = b''
        return name, argres


@@ -140,11 +141,13 @@ class DocParser(object):
        foundat = -1
        for j in range(pos, end):
            item = self.docList[j]
-            if item.find('=') >= 0:
-                (name, argres) = item.split('=',1)
+            if item.find(b'=') >= 0:
+                (name, argres) = item.split(b'=',1)
            else :
                name = item
                argres = ''
+            if (isinstance(tagpath,str)):
+                tagpath = tagpath.encode('utf-8')
            if name.endswith(tagpath) :
                result = argres
                foundat = j
@@ -170,7 +173,7 @@ class DocParser(object):
        argres=[]
        (foundat, argt) = self.findinDoc(tagpath, pos, end)
        if (argt != None) and (len(argt) > 0) :
-            argList = argt.split('|')
+            argList = argt.split(b'|')
            argres = [ int(strval) for strval in argList]
        return argres

@@ -191,21 +194,21 @@ class DocParser(object):

        # also some class names have spaces in them so need to convert to dashes
        if nclass != None :
-            nclass = nclass.replace(' ','-')
-            classres = ''
+            nclass = nclass.replace(b' ',b'-')
+            classres = b''
            nclass = nclass.lower()
-            nclass = 'cl-' + nclass
-            baseclass = ''
+            nclass = b'cl-' + nclass
+            baseclass = b''
            # graphic is the base class for captions
-            if nclass.find('cl-cap-') >=0 :
-                classres = 'graphic' + ' '
+            if nclass.find(b'cl-cap-') >=0 :
+                classres = b'graphic' + b' '
            else :
                # strip to find baseclass
-                p = nclass.find('_')
+                p = nclass.find(b'_')
                if p > 0 :
                    baseclass = nclass[0:p]
                    if baseclass in self.classList:
-                        classres += baseclass + ' '
+                        classres += baseclass + b' '
            classres += nclass
            nclass = classres
        return nclass
@@ -225,11 +228,11 @@ class DocParser(object):
            return -1

        result = []
-        (pos, pagetype) = self.findinDoc('page.type',0,-1)
+        (pos, pagetype) = self.findinDoc(b'page.type',0,-1)

-        groupList = self.posinDoc('page.group')
-        groupregionList = self.posinDoc('page.group.region')
-        pageregionList = self.posinDoc('page.region')
+        groupList = self.posinDoc(b'page.group')
+        groupregionList = self.posinDoc(b'page.group.region')
+        pageregionList = self.posinDoc(b'page.region')
        # integrate into one list
        for j in groupList:
            result.append(('grpbeg',j))
@@ -237,7 +240,7 @@ class DocParser(object):
            result.append(('gregion',j))
        for j in pageregionList:
            result.append(('pregion',j))
-        result.sort(compare)
+        result.sort(key=functools.cmp_to_key(compare))

        # insert group end and page end indicators
        inGroup = False
@@ -267,33 +270,33 @@ class DocParser(object):
        result = []

        # paragraph
-        (pos, pclass) = self.findinDoc('paragraph.class',start,end)
+        (pos, pclass) = self.findinDoc(b'paragraph.class',start,end)

        pclass = self.getClass(pclass)

        # if paragraph uses extratokens (extra glyphs) then make it fixed
-        (pos, extraglyphs) = self.findinDoc('paragraph.extratokens',start,end)
+        (pos, extraglyphs) = self.findinDoc(b'paragraph.extratokens',start,end)

        # build up a description of the paragraph in result and return it
        # first check for the  basic - all words paragraph
-        (pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
-        (pos, slast) = self.findinDoc('paragraph.lastWord',start,end)
+        (pos, sfirst) = self.findinDoc(b'paragraph.firstWord',start,end)
+        (pos, slast) = self.findinDoc(b'paragraph.lastWord',start,end)
        if (sfirst != None) and (slast != None) :
            first = int(sfirst)
            last = int(slast)

-            makeImage = (regtype == 'vertical') or (regtype == 'table')
+            makeImage = (regtype == b'vertical') or (regtype == b'table')
            makeImage = makeImage or (extraglyphs != None)
            if self.fixedimage:
-                makeImage = makeImage or (regtype == 'fixed')
+                makeImage = makeImage or (regtype == b'fixed')

            if (pclass != None):
-                makeImage = makeImage or (pclass.find('.inverted') >= 0)
+                makeImage = makeImage or (pclass.find(b'.inverted') >= 0)
                if self.fixedimage :
-                    makeImage = makeImage or (pclass.find('cl-f-') >= 0)
+                    makeImage = makeImage or (pclass.find(b'cl-f-') >= 0)

            # before creating an image make sure glyph info exists
-            gidList = self.getData('info.glyph.glyphID',0,-1)
+            gidList = self.getData(b'info.glyph.glyphID',0,-1)

            makeImage = makeImage & (len(gidList) > 0)

@@ -307,8 +310,8 @@ class DocParser(object):
            # translate first and last word into first and last glyphs
            # and generate inline image and include it
            glyphList = []
-            firstglyphList = self.getData('word.firstGlyph',0,-1)
-            gidList = self.getData('info.glyph.glyphID',0,-1)
+            firstglyphList = self.getData(b'word.firstGlyph',0,-1)
+            gidList = self.getData(b'info.glyph.glyphID',0,-1)
            firstGlyph = firstglyphList[first]
            if last < len(firstglyphList):
                lastGlyph = firstglyphList[last]
@@ -326,8 +329,8 @@ class DocParser(object):
            for glyphnum in range(firstGlyph, lastGlyph):
                glyphList.append(glyphnum)
            # include any extratokens if they exist
-            (pos, sfg) = self.findinDoc('extratokens.firstGlyph',start,end)
-            (pos, slg) = self.findinDoc('extratokens.lastGlyph',start,end)
+            (pos, sfg) = self.findinDoc(b'extratokens.firstGlyph',start,end)
+            (pos, slg) = self.findinDoc(b'extratokens.lastGlyph',start,end)
            if (sfg != None) and (slg != None):
                for glyphnum in range(int(sfg), int(slg)):
                    glyphList.append(glyphnum)
@@ -368,39 +371,39 @@ class DocParser(object):

            (name, argres) = self.lineinDoc(line)

-            if name.endswith('span.firstWord') :
+            if name.endswith(b'span.firstWord') :
                sp_first = int(argres)

-            elif name.endswith('span.lastWord') :
+            elif name.endswith(b'span.lastWord') :
                sp_last = int(argres)

-            elif name.endswith('word.firstGlyph') :
+            elif name.endswith(b'word.firstGlyph') :
                gl_first = int(argres)

-            elif name.endswith('word.lastGlyph') :
+            elif name.endswith(b'word.lastGlyph') :
                gl_last = int(argres)

-            elif name.endswith('word_semantic.firstWord'):
+            elif name.endswith(b'word_semantic.firstWord'):
                ws_first = int(argres)

-            elif name.endswith('word_semantic.lastWord'):
+            elif name.endswith(b'word_semantic.lastWord'):
                ws_last = int(argres)

-            elif name.endswith('word.class'):
+            elif name.endswith(b'word.class'):
                # we only handle spaceafter word class
                try:
-                    (cname, space) = argres.split('-',1)
-                    if space == '' : space = '0'
-                    if (cname == 'spaceafter') and (int(space) > 0) :
+                    (cname, space) = argres.split(b'-',1)
+                    if space == b'' : space = b'0'
+                    if (cname == b'spaceafter') and (int(space) > 0) :
                        word_class = 'sa'
                except:
                    pass

-            elif name.endswith('word.img.src'):
+            elif name.endswith(b'word.img.src'):
                result.append(('img' + word_class, int(argres)))
                word_class = ''

-            elif name.endswith('region.img.src'):
+            elif name.endswith(b'region.img.src'):
                result.append(('img' + word_class, int(argres)))

            if (sp_first != -1) and (sp_last != -1):
@@ -437,7 +440,7 @@ class DocParser(object):

        classres = ''
        if pclass :
-            classres = ' class="' + pclass + '"'
+            classres = ' class="' + pclass.decode('utf-8') + '"'

        br_lb = (regtype == 'fixed') or (regtype == 'chapterheading') or (regtype == 'vertical')

@@ -470,8 +473,8 @@ class DocParser(object):
                    if (link > 0):
                        linktype = self.link_type[link-1]
                        title = self.link_title[link-1]
-                        if (title == "") or (parares.rfind(title) < 0):
-                            title=parares[lstart:]
+                        if (title == b"") or (parares.rfind(title.decode('utf-8')) < 0):
+                            title=parares[lstart:].encode('utf-8')
                        if linktype == 'external' :
                            linkhref = self.link_href[link-1]
                            linkhtml = '<a href="%s">' % linkhref
@@ -482,33 +485,34 @@ class DocParser(object):
                            else :
                                # just link to the current page
                                linkhtml = '<a href="#' + self.id + '">'
-                        linkhtml += title + '</a>'
-                        pos = parares.rfind(title)
+                        linkhtml += title.decode('utf-8')
+                        linkhtml += '</a>'
+                        pos = parares.rfind(title.decode('utf-8'))
                        if pos >= 0:
                            parares = parares[0:pos] + linkhtml + parares[pos+len(title):]
                        else :
                            parares += linkhtml
                        lstart = len(parares)
-                        if word == '_link_' : word = ''
+                        if word == b'_link_' : word = b''
                    elif (link < 0) :
-                        if word == '_link_' : word = ''
+                        if word == b'_link_' : word = b''

-                if word == '_lb_':
+                if word == b'_lb_':
                    if ((num-1) in self.dehyphen_rootid ) or handle_links:
-                        word = ''
+                        word = b''
                        sep = ''
                    elif br_lb :
-                        word = '<br />\n'
+                        word = b'<br />\n'
                        sep = ''
                    else :
-                        word = '\n'
+                        word = b'\n'
                        sep = ''

                if num in self.dehyphen_rootid :
                    word = word[0:-1]
                    sep = ''

-                parares += word + sep
+                parares += word.decode('utf-8') + sep

            elif wtype == 'img' :
                sep = ''
@@ -522,7 +526,9 @@ class DocParser(object):

            elif wtype == 'svg' :
                sep = ''
-                parares += '<img src="img/' + self.id + '_%04d.svg" alt="" />' % num
+                parares += '<img src="img/'
+                parares += self.id
+                parares += '_%04d.svg" alt="" />' % num
                parares += sep

        if len(sep) > 0 : parares = parares[0:-1]
@@ -545,7 +551,7 @@ class DocParser(object):
            (wtype, num) = pdesc[j]

            if wtype == 'ocr' :
-                word = self.ocrtext[num]
+                word = self.ocrtext[num].decode('utf-8')
                sep = ' '

                if handle_links:
@@ -553,7 +559,7 @@ class DocParser(object):
                    if (link > 0):
                        linktype = self.link_type[link-1]
                        title = self.link_title[link-1]
-                        title = title.rstrip('. ')
+                        title = title.rstrip(b'. ')
                        alt_title = parares[lstart:]
                        alt_title = alt_title.strip()
                        # now strip off the actual printed page number
@@ -607,38 +613,38 @@ class DocParser(object):
        hlst = []

        # get the ocr text
-        (pos, argres) = self.findinDoc('info.word.ocrText',0,-1)
-        if argres :  self.ocrtext = argres.split('|')
+        (pos, argres) = self.findinDoc(b'info.word.ocrText',0,-1)
+        if argres :  self.ocrtext = argres.split(b'|')

        # get information to dehyphenate the text
-        self.dehyphen_rootid = self.getData('info.dehyphen.rootID',0,-1)
+        self.dehyphen_rootid = self.getData(b'info.dehyphen.rootID',0,-1)

        # determine if first paragraph is continued from previous page
-        (pos, self.parastems_stemid) = self.findinDoc('info.paraStems.stemID',0,-1)
+        (pos, self.parastems_stemid) = self.findinDoc(b'info.paraStems.stemID',0,-1)
        first_para_continued = (self.parastems_stemid  != None)

        # determine if last paragraph is continued onto the next page
-        (pos, self.paracont_stemid) = self.findinDoc('info.paraCont.stemID',0,-1)
+        (pos, self.paracont_stemid) = self.findinDoc(b'info.paraCont.stemID',0,-1)
        last_para_continued = (self.paracont_stemid != None)

        # collect link ids
-        self.link_id = self.getData('info.word.link_id',0,-1)
+        self.link_id = self.getData(b'info.word.link_id',0,-1)

        # collect link destination page numbers
-        self.link_page = self.getData('info.links.page',0,-1)
+        self.link_page = self.getData(b'info.links.page',0,-1)

        # collect link types (container versus external)
-        (pos, argres) = self.findinDoc('info.links.type',0,-1)
-        if argres :  self.link_type = argres.split('|')
+        (pos, argres) = self.findinDoc(b'info.links.type',0,-1)
+        if argres :  self.link_type = argres.split(b'|')

        # collect link destinations
-        (pos, argres) = self.findinDoc('info.links.href',0,-1)
-        if argres :  self.link_href = argres.split('|')
+        (pos, argres) = self.findinDoc(b'info.links.href',0,-1)
+        if argres :  self.link_href = argres.split(b'|')

        # collect link titles
-        (pos, argres) = self.findinDoc('info.links.title',0,-1)
+        (pos, argres) = self.findinDoc(b'info.links.title',0,-1)
        if argres :
-            self.link_title = argres.split('|')
+            self.link_title = argres.split(b'|')
        else:
            self.link_title.append('')

@@ -662,51 +668,51 @@ class DocParser(object):
            # set anchor for link target on this page
            if not anchorSet and not first_para_continued:
                hlst.append('<div style="visibility: hidden; height: 0; width: 0;" id="')
-                hlst.append(self.id + '" title="pagetype_' + pagetype + '"></div>\n')
+                hlst.append(self.id + '" title="pagetype_' + pagetype.decode('utf-8') + '"></div>\n')
                anchorSet = True

            # handle groups of graphics with text captions
-            if (etype == 'grpbeg'):
-                (pos, grptype) = self.findinDoc('group.type', start, end)
+            if (etype == b'grpbeg'):
+                (pos, grptype) = self.findinDoc(b'group.type', start, end)
                if grptype != None:
-                    if grptype == 'graphic':
-                        gcstr = ' class="' + grptype + '"'
+                    if grptype == b'graphic':
+                        gcstr = ' class="' + grptype.decode('utf-8') + '"'
                        hlst.append('<div' + gcstr + '>')
                        inGroup = True

-            elif (etype == 'grpend'):
+            elif (etype == b'grpend'):
                if inGroup:
                    hlst.append('</div>\n')
                    inGroup = False

            else:
-                (pos, regtype) = self.findinDoc('region.type',start,end)
+                (pos, regtype) = self.findinDoc(b'region.type',start,end)

-                if regtype == 'graphic' :
-                    (pos, simgsrc) = self.findinDoc('img.src',start,end)
+                if regtype == b'graphic' :
+                    (pos, simgsrc) = self.findinDoc(b'img.src',start,end)
                    if simgsrc:
                        if inGroup:
                            hlst.append('<img src="img/img%04d.jpg" alt="" />' % int(simgsrc))
                        else:
                            hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc))

-                elif regtype == 'chapterheading' :
+                elif regtype == b'chapterheading' :
                    (pclass, pdesc) = self.getParaDescription(start,end, regtype)
                    if not breakSet:
                        hlst.append('<div style="page-break-after: always;">&nbsp;</div>\n')
                        breakSet = True
                    tag = 'h1'
                    if pclass and (len(pclass) >= 7):
-                        if pclass[3:7] == 'ch1-' : tag = 'h1'
-                        if pclass[3:7] == 'ch2-' : tag = 'h2'
-                        if pclass[3:7] == 'ch3-' : tag = 'h3'
-                        hlst.append('<' + tag + ' class="' + pclass + '">')
+                        if pclass[3:7] == b'ch1-' : tag = 'h1'
+                        if pclass[3:7] == b'ch2-' : tag = 'h2'
+                        if pclass[3:7] == b'ch3-' : tag = 'h3'
+                        hlst.append('<' + tag + ' class="' + pclass.decode('utf-8') + '">')
                    else:
                        hlst.append('<' + tag + '>')
                    hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype))
                    hlst.append('</' + tag + '>')

-                elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') or (regtype == 'listitem'):
+                elif (regtype == b'text') or (regtype == b'fixed') or (regtype == b'insert') or (regtype == b'listitem'):
                    ptype = 'full'
                    # check to see if this is a continution from the previous page
                    if first_para_continued :
@@ -715,16 +721,16 @@ class DocParser(object):
                    (pclass, pdesc) = self.getParaDescription(start,end, regtype)
                    if pclass and (len(pclass) >= 6) and (ptype == 'full'):
                        tag = 'p'
-                        if pclass[3:6] == 'h1-' : tag = 'h4'
-                        if pclass[3:6] == 'h2-' : tag = 'h5'
-                        if pclass[3:6] == 'h3-' : tag = 'h6'
-                        hlst.append('<' + tag + ' class="' + pclass + '">')
+                        if pclass[3:6] == b'h1-' : tag = 'h4'
+                        if pclass[3:6] == b'h2-' : tag = 'h5'
+                        if pclass[3:6] == b'h3-' : tag = 'h6'
+                        hlst.append('<' + tag + ' class="' + pclass.decode('utf-8') + '">')
                        hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype))
                        hlst.append('</' + tag + '>')
                    else :
                        hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))

-                elif (regtype == 'tocentry') :
+                elif (regtype == b'tocentry') :
                    ptype = 'full'
                    if first_para_continued :
                        ptype = 'end'
@@ -733,7 +739,7 @@ class DocParser(object):
                    tocinfo += self.buildTOCEntry(pdesc)
                    hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))

-                elif (regtype == 'vertical') or (regtype == 'table') :
+                elif (regtype == b'vertical') or (regtype == b'table') :
                    ptype = 'full'
                    if inGroup:
                        ptype = 'middle'
@@ -744,19 +750,19 @@ class DocParser(object):
                    hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))


-                elif (regtype == 'synth_fcvr.center'):
-                    (pos, simgsrc) = self.findinDoc('img.src',start,end)
+                elif (regtype == b'synth_fcvr.center'):
+                    (pos, simgsrc) = self.findinDoc(b'img.src',start,end)
                    if simgsrc:
                        hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc))

                else :
                    print('          Making region type', regtype, end=' ')
-                    (pos, temp) = self.findinDoc('paragraph',start,end)
-                    (pos2, temp) = self.findinDoc('span',start,end)
+                    (pos, temp) = self.findinDoc(b'paragraph',start,end)
+                    (pos2, temp) = self.findinDoc(b'span',start,end)
                    if pos != -1 or pos2 != -1:
                        print(' a "text" region')
                        orig_regtype = regtype
-                        regtype = 'fixed'
+                        regtype = b'fixed'
                        ptype = 'full'
                        # check to see if this is a continution from the previous page
                        if first_para_continued :
@@ -764,23 +770,23 @@ class DocParser(object):
                            first_para_continued = False
                        (pclass, pdesc) = self.getParaDescription(start,end, regtype)
                        if not pclass:
-                            if orig_regtype.endswith('.right')     : pclass = 'cl-right'
-                            elif orig_regtype.endswith('.center')  : pclass = 'cl-center'
-                            elif orig_regtype.endswith('.left')    : pclass = 'cl-left'
-                            elif orig_regtype.endswith('.justify') : pclass = 'cl-justify'
+                            if orig_regtype.endswith(b'.right')     : pclass = 'cl-right'
+                            elif orig_regtype.endswith(b'.center')  : pclass = 'cl-center'
+                            elif orig_regtype.endswith(b'.left')    : pclass = 'cl-left'
+                            elif orig_regtype.endswith(b'.justify') : pclass = 'cl-justify'
                        if pclass and (ptype == 'full') and (len(pclass) >= 6):
                            tag = 'p'
-                            if pclass[3:6] == 'h1-' : tag = 'h4'
-                            if pclass[3:6] == 'h2-' : tag = 'h5'
-                            if pclass[3:6] == 'h3-' : tag = 'h6'
-                            hlst.append('<' + tag + ' class="' + pclass + '">')
+                            if pclass[3:6] == b'h1-' : tag = 'h4'
+                            if pclass[3:6] == b'h2-' : tag = 'h5'
+                            if pclass[3:6] == b'h3-' : tag = 'h6'
+                            hlst.append('<' + tag + ' class="' + pclass.decode('utf-8') + '">')
                            hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype))
                            hlst.append('</' + tag + '>')
                        else :
                            hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype))
                    else :
                        print(' a "graphic" region')
-                        (pos, simgsrc) = self.findinDoc('img.src',start,end)
+                        (pos, simgsrc) = self.findinDoc(b'img.src',start,end)
                        if simgsrc:
                            hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc))