topazscripts 1.8

2010-01-24 12:19:20 +00:00
parent c93f8e1edd
commit 24f001c61e
12 changed files with 332 additions and 72 deletions
--- a/Topaz_Tools/lib/flatxml2html.py
+++ b/Topaz_Tools/lib/flatxml2html.py
@@ -1,21 +1,27 @@
 #! /usr/bin/python
 # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+# For use with Topaz Scripts Version 1.8                                                                                                  

 from __future__ import with_statement
 import csv
 import sys
 import os
+import math
 import getopt
 from struct import pack
 from struct import unpack


 class DocParser(object):
-    def __init__(self, flatxml, classlst, fileid):
+    def __init__(self, flatxml, classlst, fileid, bookDir):
        self.id = os.path.basename(fileid).replace('.dat','')
+        self.svgcount = 0
        self.docList = flatxml.split('\n')
        self.docSize = len(self.docList)
        self.classList = {}
+        self.bookDir = bookDir
+        self.glyphPaths = { }
+        self.numPaths = 0
        tmpList = classlst.split('\n')
        for pclass in tmpList:
            if pclass != '':
@@ -30,6 +36,107 @@ class DocParser(object):
        self.paracont_stemid = []
        self.parastems_stemid = []

+
+    def getGlyph(self, gid):
+        result = ''
+        id='gl%d' % gid
+        return self.glyphPaths[id]
+
+
+    def glyphs_to_image(self, glyphList):
+
+        def extract(path, key):
+            b = path.find(key) + len(key)
+            e = path.find(' ',b)
+            return int(path[b:e])
+
+        def extractID(path, key):
+            b = path.find(key) + len(key)
+            e = path.find('"',b)
+            return path[b:e]
+            
+
+        svgDir = os.path.join(self.bookDir,'svg')
+        glyfile = os.path.join(svgDir,'glyphs.svg')
+
+        imgDir = os.path.join(self.bookDir,'img')
+        imgname = self.id + '_%04d.svg' % self.svgcount
+        imgfile = os.path.join(imgDir,imgname)
+
+        # build hash table of glyph paths keyed by glyph id
+        if self.numPaths == 0:
+            gfile = open(glyfile, 'r')
+            while True:
+                path = gfile.readline()
+                if (path == ''): break
+                glyphid = extractID(path,'id="')
+                self.glyphPaths[glyphid] = path
+                self.numPaths += 1
+            gfile.close()
+
+
+        # get glyph information
+        gxList = self.getData('info.glyph.x',0,-1)
+        gyList = self.getData('info.glyph.y',0,-1)
+        gidList = self.getData('info.glyph.glyphID',0,-1)
+
+        gids = []
+        maxws = []
+        maxhs = []
+        xs = []
+        ys = []
+        gdefs = []
+
+        # get path defintions, positions, dimensions for ecah glyph 
+        # that makes up the image, and find min x and min y to reposition origin
+        minx = -1
+        miny = -1
+        for j in glyphList:
+            gid = gidList[j]
+            gids.append(gid)
+
+            xs.append(gxList[j])
+            if minx == -1: minx = gxList[j]
+            else : minx = min(minx, gxList[j])
+ 
+            ys.append(gyList[j])
+            if miny == -1: miny = gyList[j]
+            else : miny = min(miny, gyList[j])
+
+            path = self.getGlyph(gid)
+            gdefs.append(path)
+
+            maxws.append(extract(path,'width='))
+            maxhs.append(extract(path,'height='))
+
+
+        # change the origin to minx, miny and calc max height and width
+        maxw = maxws[0] + xs[0] - minx
+        maxh = maxhs[0] + ys[0] - miny
+        for j in xrange(0, len(xs)):
+            xs[j] = xs[j] - minx
+            ys[j] = ys[j] - miny
+            maxw = max( maxw, (maxws[j] + xs[j]) )
+            maxh = max( maxh, (maxhs[j] + ys[j]) )
+
+        # open the image file for output
+        ifile = open(imgfile,'w')
+        ifile.write('<?xml version="1.0" standalone="no"?>\n')
+        ifile.write('<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n')
+        ifile.write('<svg width="%dpx" height="%dpx" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">\n' % (math.floor(maxw/10), math.floor(maxh/10), maxw, maxh))
+        ifile.write('<defs>\n')
+        for j in xrange(0,len(gdefs)):
+            ifile.write(gdefs[j])
+        ifile.write('</defs>\n')
+        for j in xrange(0,len(gids)):
+            ifile.write('<use xlink:href="#gl%d" x="%d" y="%d" />\n' % (gids[j], xs[j], ys[j]))
+        ifile.write('</svg>')
+        ifile.close()
+
+        return 0
+
+
+
    # return tag at line pos in document
    def lineinDoc(self, pos) :
        if (pos >= 0) and (pos < self.docSize) :
@@ -77,6 +184,17 @@ class DocParser(object):
        return startpos


+    # returns a vector of integers for the tagpath
+    def getData(self, tagpath, pos, end):
+        argres=[]
+        (foundat, argt) = self.findinDoc(tagpath, pos, end)
+        if (argt != None) and (len(argt) > 0) :
+            argList = argt.split('|')
+            argres = [ int(strval) for strval in argList]
+        return argres
+
+
+
    # build a description of the paragraph
    def getParaDescription(self, start, end):

@@ -120,6 +238,7 @@ class DocParser(object):
        # this type of paragrph may be made up of multiple _spans, inline 
        # word monograms (images) and words with semantic meaning
        # and now a new type "span" versus the old "_span"
+        # plus glyphs used to form starting letter of first word
        
        # need to parse this type line by line
        line = start + 1
@@ -143,6 +262,21 @@ class DocParser(object):
                    result.append(('ocr', wordnum))
                line += 1

+            elif name.endswith('word.firstGlyph') :
+                first = int(argres)
+                (name, argres) = self.lineinDoc(line+1)
+                if not name.endswith('word.lastGlyph'):
+                    print 'Error: - incorrect glyph ordering inside word in paragraph'
+                last = int(argres)
+                glyphList = []
+                for glyphnum in xrange(first, last):
+                    glyphList.append(glyphnum)
+                num = self.svgcount
+                self.glyphs_to_image(glyphList)
+                self.svgcount += 1
+                result.append(('svg', num))
+                line += 1
+
            elif name.endswith('word.class'):
               (cname, space) = argres.split('-',1)
               if space == '' : space = '0'
@@ -241,6 +375,11 @@ class DocParser(object):
                parares += '<img src="img/img%04d.jpg" alt="" />' % num
                parares += sep

+            elif wtype == 'svg' :
+                sep = ''
+                parares += '<img src="img/' + self.id + '_%04d.svg" alt="" />' % num 
+                parares += sep
+
        if len(sep) > 0 : parares = parares[0:-1]
        if (type == 'full') or (type == 'end') :
            parares += '</p>'
@@ -260,10 +399,7 @@ class DocParser(object):
        if argres :  self.ocrtext = argres.split('|')

        # get information to dehyphenate the text
-        (pos, argres) = self.findinDoc('info.dehyphen.rootID',0,-1)
-        if argres: 
-            argList = argres.split('|')
-            self.dehyphen_rootid = [ int(strval) for strval in argList]
+        self.dehyphen_rootid = self.getData('info.dehyphen.rootID',0,-1)

        # determine if first paragraph is continued from previous page
        (pos, self.parastems_stemid) = self.findinDoc('info.paraStems.stemID',0,-1)
@@ -274,16 +410,10 @@ class DocParser(object):
        last_para_continued = (self.paracont_stemid != None)

        # collect link ids
-        (pos, argres) = self.findinDoc('info.word.link_id',0,-1)
-        if argres:
-            argList = argres.split('|')
-            self.link_id = [ int(strval) for strval in argList]
+        self.link_id = self.getData('info.word.link_id',0,-1)

        # collect link destination page numbers
-        (pos, argres) = self.findinDoc('info.links.page',0,-1)
-        if argres :
-            argList = argres.split('|')
-            self.link_page = [ int(strval) for strval in argList]
+        self.link_page = self.getData('info.links.page',0,-1)

        # collect link titles
        (pos, argres) = self.findinDoc('info.links.title',0,-1)
@@ -382,23 +512,45 @@ class DocParser(object):


            elif (regtype == 'table') :
-                ptype = 'full'
-                if first_para_continued :
-                    ptype = 'end'
-                    first_para_continued = False
-                (pclass, pdesc) = self.getParaDescription(start,end)
-                htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
-                print "Warnings - Table Conversions are notoriously poor"
-                print "Strongly recommend taking a screen capture image of the "
-                print "table in %s.svg and using it to replace this attempt at a table" % self.id
-
+                # translate first and last word into first and last glyphs
+                # and generate table as an image and include a link to it
+                glyphList = []
+                (pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
+                (pos, slast) = self.findinDoc('paragraph.lastWord',start,end)
+                firstglyphList = self.getData('word.firstGlyph',0,-1)
+                gidList = self.getData('info.glyph.glyphID',0,-1)
+                if (sfirst != None) and (slast != None) :
+                    first = int(sfirst)
+                    last = int(slast)
+                    firstGlyph = firstglyphList[first]
+                    if last < len(firstglyphList):
+                        lastGlyph = firstglyphList[last]
+                    else :
+                        lastGlyph = len(gidList)
+                    for glyphnum in xrange(firstGlyph, lastGlyph):
+                        glyphList.append(glyphnum)
+                    num = self.svgcount
+                    self.glyphs_to_image(glyphList)
+                    self.svgcount += 1
+                    htmlpage += '<div class="graphic"><img src="img/' + self.id + '_%04d.svg" alt="" /></div>' % num
+                else :
+                    ptype = 'full'
+                    if first_para_continued :
+                        ptype = 'end'
+                        first_para_continued = False
+                        (pclass, pdesc) = self.getParaDescription(start,end)
+                        htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
+                        print " "
+                        print "Warning: - Table Conversions are notoriously poor"
+                        print "    Strongly recommend taking a screen capture image of the "
+                        print "    table in %s.svg and using it to replace this attempt at a table" % self.id
+                        print " "

            elif (regtype == 'synth_fcvr.center') or (regtype == 'synth_text.center'):
                (pos, simgsrc) = self.findinDoc('img.src',start,end)
                if simgsrc:
                    htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)

-
            else :
                print 'Warning: region type', regtype
                (pos, temp) = self.findinDoc('paragraph',start,end)
@@ -437,10 +589,10 @@ class DocParser(object):



-def convert2HTML(flatxml, classlst, fileid):
+def convert2HTML(flatxml, classlst, fileid, bookDir):

    # create a document parser
-    dp = DocParser(flatxml, classlst, fileid)
+    dp = DocParser(flatxml, classlst, fileid, bookDir)

    htmlpage = dp.process()