topazscripts 1.8

This commit is contained in:
some_updates
2010-01-24 12:19:20 +00:00
committed by Apprentice Alf
parent c93f8e1edd
commit 24f001c61e
12 changed files with 332 additions and 72 deletions

View File

@@ -1,21 +1,27 @@
#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# For use with Topaz Scripts Version 1.8
from __future__ import with_statement
import csv
import sys
import os
import math
import getopt
from struct import pack
from struct import unpack
class DocParser(object):
def __init__(self, flatxml, classlst, fileid):
def __init__(self, flatxml, classlst, fileid, bookDir):
self.id = os.path.basename(fileid).replace('.dat','')
self.svgcount = 0
self.docList = flatxml.split('\n')
self.docSize = len(self.docList)
self.classList = {}
self.bookDir = bookDir
self.glyphPaths = { }
self.numPaths = 0
tmpList = classlst.split('\n')
for pclass in tmpList:
if pclass != '':
@@ -30,6 +36,107 @@ class DocParser(object):
self.paracont_stemid = []
self.parastems_stemid = []
def getGlyph(self, gid):
result = ''
id='gl%d' % gid
return self.glyphPaths[id]
def glyphs_to_image(self, glyphList):
def extract(path, key):
b = path.find(key) + len(key)
e = path.find(' ',b)
return int(path[b:e])
def extractID(path, key):
b = path.find(key) + len(key)
e = path.find('"',b)
return path[b:e]
svgDir = os.path.join(self.bookDir,'svg')
glyfile = os.path.join(svgDir,'glyphs.svg')
imgDir = os.path.join(self.bookDir,'img')
imgname = self.id + '_%04d.svg' % self.svgcount
imgfile = os.path.join(imgDir,imgname)
# build hash table of glyph paths keyed by glyph id
if self.numPaths == 0:
gfile = open(glyfile, 'r')
while True:
path = gfile.readline()
if (path == ''): break
glyphid = extractID(path,'id="')
self.glyphPaths[glyphid] = path
self.numPaths += 1
gfile.close()
# get glyph information
gxList = self.getData('info.glyph.x',0,-1)
gyList = self.getData('info.glyph.y',0,-1)
gidList = self.getData('info.glyph.glyphID',0,-1)
gids = []
maxws = []
maxhs = []
xs = []
ys = []
gdefs = []
# get path defintions, positions, dimensions for ecah glyph
# that makes up the image, and find min x and min y to reposition origin
minx = -1
miny = -1
for j in glyphList:
gid = gidList[j]
gids.append(gid)
xs.append(gxList[j])
if minx == -1: minx = gxList[j]
else : minx = min(minx, gxList[j])
ys.append(gyList[j])
if miny == -1: miny = gyList[j]
else : miny = min(miny, gyList[j])
path = self.getGlyph(gid)
gdefs.append(path)
maxws.append(extract(path,'width='))
maxhs.append(extract(path,'height='))
# change the origin to minx, miny and calc max height and width
maxw = maxws[0] + xs[0] - minx
maxh = maxhs[0] + ys[0] - miny
for j in xrange(0, len(xs)):
xs[j] = xs[j] - minx
ys[j] = ys[j] - miny
maxw = max( maxw, (maxws[j] + xs[j]) )
maxh = max( maxh, (maxhs[j] + ys[j]) )
# open the image file for output
ifile = open(imgfile,'w')
ifile.write('<?xml version="1.0" standalone="no"?>\n')
ifile.write('<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n')
ifile.write('<svg width="%dpx" height="%dpx" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">\n' % (math.floor(maxw/10), math.floor(maxh/10), maxw, maxh))
ifile.write('<defs>\n')
for j in xrange(0,len(gdefs)):
ifile.write(gdefs[j])
ifile.write('</defs>\n')
for j in xrange(0,len(gids)):
ifile.write('<use xlink:href="#gl%d" x="%d" y="%d" />\n' % (gids[j], xs[j], ys[j]))
ifile.write('</svg>')
ifile.close()
return 0
# return tag at line pos in document
def lineinDoc(self, pos) :
if (pos >= 0) and (pos < self.docSize) :
@@ -77,6 +184,17 @@ class DocParser(object):
return startpos
# returns a vector of integers for the tagpath
def getData(self, tagpath, pos, end):
argres=[]
(foundat, argt) = self.findinDoc(tagpath, pos, end)
if (argt != None) and (len(argt) > 0) :
argList = argt.split('|')
argres = [ int(strval) for strval in argList]
return argres
# build a description of the paragraph
def getParaDescription(self, start, end):
@@ -120,6 +238,7 @@ class DocParser(object):
# this type of paragrph may be made up of multiple _spans, inline
# word monograms (images) and words with semantic meaning
# and now a new type "span" versus the old "_span"
# plus glyphs used to form starting letter of first word
# need to parse this type line by line
line = start + 1
@@ -143,6 +262,21 @@ class DocParser(object):
result.append(('ocr', wordnum))
line += 1
elif name.endswith('word.firstGlyph') :
first = int(argres)
(name, argres) = self.lineinDoc(line+1)
if not name.endswith('word.lastGlyph'):
print 'Error: - incorrect glyph ordering inside word in paragraph'
last = int(argres)
glyphList = []
for glyphnum in xrange(first, last):
glyphList.append(glyphnum)
num = self.svgcount
self.glyphs_to_image(glyphList)
self.svgcount += 1
result.append(('svg', num))
line += 1
elif name.endswith('word.class'):
(cname, space) = argres.split('-',1)
if space == '' : space = '0'
@@ -241,6 +375,11 @@ class DocParser(object):
parares += '<img src="img/img%04d.jpg" alt="" />' % num
parares += sep
elif wtype == 'svg' :
sep = ''
parares += '<img src="img/' + self.id + '_%04d.svg" alt="" />' % num
parares += sep
if len(sep) > 0 : parares = parares[0:-1]
if (type == 'full') or (type == 'end') :
parares += '</p>'
@@ -260,10 +399,7 @@ class DocParser(object):
if argres : self.ocrtext = argres.split('|')
# get information to dehyphenate the text
(pos, argres) = self.findinDoc('info.dehyphen.rootID',0,-1)
if argres:
argList = argres.split('|')
self.dehyphen_rootid = [ int(strval) for strval in argList]
self.dehyphen_rootid = self.getData('info.dehyphen.rootID',0,-1)
# determine if first paragraph is continued from previous page
(pos, self.parastems_stemid) = self.findinDoc('info.paraStems.stemID',0,-1)
@@ -274,16 +410,10 @@ class DocParser(object):
last_para_continued = (self.paracont_stemid != None)
# collect link ids
(pos, argres) = self.findinDoc('info.word.link_id',0,-1)
if argres:
argList = argres.split('|')
self.link_id = [ int(strval) for strval in argList]
self.link_id = self.getData('info.word.link_id',0,-1)
# collect link destination page numbers
(pos, argres) = self.findinDoc('info.links.page',0,-1)
if argres :
argList = argres.split('|')
self.link_page = [ int(strval) for strval in argList]
self.link_page = self.getData('info.links.page',0,-1)
# collect link titles
(pos, argres) = self.findinDoc('info.links.title',0,-1)
@@ -382,23 +512,45 @@ class DocParser(object):
elif (regtype == 'table') :
ptype = 'full'
if first_para_continued :
ptype = 'end'
first_para_continued = False
(pclass, pdesc) = self.getParaDescription(start,end)
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
print "Warnings - Table Conversions are notoriously poor"
print "Strongly recommend taking a screen capture image of the "
print "table in %s.svg and using it to replace this attempt at a table" % self.id
# translate first and last word into first and last glyphs
# and generate table as an image and include a link to it
glyphList = []
(pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
(pos, slast) = self.findinDoc('paragraph.lastWord',start,end)
firstglyphList = self.getData('word.firstGlyph',0,-1)
gidList = self.getData('info.glyph.glyphID',0,-1)
if (sfirst != None) and (slast != None) :
first = int(sfirst)
last = int(slast)
firstGlyph = firstglyphList[first]
if last < len(firstglyphList):
lastGlyph = firstglyphList[last]
else :
lastGlyph = len(gidList)
for glyphnum in xrange(firstGlyph, lastGlyph):
glyphList.append(glyphnum)
num = self.svgcount
self.glyphs_to_image(glyphList)
self.svgcount += 1
htmlpage += '<div class="graphic"><img src="img/' + self.id + '_%04d.svg" alt="" /></div>' % num
else :
ptype = 'full'
if first_para_continued :
ptype = 'end'
first_para_continued = False
(pclass, pdesc) = self.getParaDescription(start,end)
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
print " "
print "Warning: - Table Conversions are notoriously poor"
print " Strongly recommend taking a screen capture image of the "
print " table in %s.svg and using it to replace this attempt at a table" % self.id
print " "
elif (regtype == 'synth_fcvr.center') or (regtype == 'synth_text.center'):
(pos, simgsrc) = self.findinDoc('img.src',start,end)
if simgsrc:
htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
else :
print 'Warning: region type', regtype
(pos, temp) = self.findinDoc('paragraph',start,end)
@@ -437,10 +589,10 @@ class DocParser(object):
def convert2HTML(flatxml, classlst, fileid):
def convert2HTML(flatxml, classlst, fileid, bookDir):
# create a document parser
dp = DocParser(flatxml, classlst, fileid)
dp = DocParser(flatxml, classlst, fileid, bookDir)
htmlpage = dp.process()