More fixes for Amazon books, fixing identity checks, started on Topaz.
This commit is contained in:
@@ -56,7 +56,7 @@ def readEncodedNumber(file):
|
||||
c = file.read(1)
|
||||
if (len(c) == 0):
|
||||
return None
|
||||
data = ord(c)
|
||||
data = c[0]
|
||||
datax = (datax <<7) + (data & 0x7F)
|
||||
data = datax
|
||||
|
||||
@@ -188,232 +188,232 @@ class PageParser(object):
|
||||
# tag : (number of arguments, argument type, subtags present, special case of subtags presents when escaped)
|
||||
|
||||
token_tags = {
|
||||
'x' : (1, 'scalar_number', 0, 0),
|
||||
'y' : (1, 'scalar_number', 0, 0),
|
||||
'h' : (1, 'scalar_number', 0, 0),
|
||||
'w' : (1, 'scalar_number', 0, 0),
|
||||
'firstWord' : (1, 'scalar_number', 0, 0),
|
||||
'lastWord' : (1, 'scalar_number', 0, 0),
|
||||
'rootID' : (1, 'scalar_number', 0, 0),
|
||||
'stemID' : (1, 'scalar_number', 0, 0),
|
||||
'type' : (1, 'scalar_text', 0, 0),
|
||||
b'x' : (1, 'scalar_number', 0, 0),
|
||||
b'y' : (1, 'scalar_number', 0, 0),
|
||||
b'h' : (1, 'scalar_number', 0, 0),
|
||||
b'w' : (1, 'scalar_number', 0, 0),
|
||||
b'firstWord' : (1, 'scalar_number', 0, 0),
|
||||
b'lastWord' : (1, 'scalar_number', 0, 0),
|
||||
b'rootID' : (1, 'scalar_number', 0, 0),
|
||||
b'stemID' : (1, 'scalar_number', 0, 0),
|
||||
b'type' : (1, 'scalar_text', 0, 0),
|
||||
|
||||
'info' : (0, 'number', 1, 0),
|
||||
b'info' : (0, 'number', 1, 0),
|
||||
|
||||
'info.word' : (0, 'number', 1, 1),
|
||||
'info.word.ocrText' : (1, 'text', 0, 0),
|
||||
'info.word.firstGlyph' : (1, 'raw', 0, 0),
|
||||
'info.word.lastGlyph' : (1, 'raw', 0, 0),
|
||||
'info.word.bl' : (1, 'raw', 0, 0),
|
||||
'info.word.link_id' : (1, 'number', 0, 0),
|
||||
b'info.word' : (0, 'number', 1, 1),
|
||||
b'info.word.ocrText' : (1, 'text', 0, 0),
|
||||
b'info.word.firstGlyph' : (1, 'raw', 0, 0),
|
||||
b'info.word.lastGlyph' : (1, 'raw', 0, 0),
|
||||
b'info.word.bl' : (1, 'raw', 0, 0),
|
||||
b'info.word.link_id' : (1, 'number', 0, 0),
|
||||
|
||||
'glyph' : (0, 'number', 1, 1),
|
||||
'glyph.x' : (1, 'number', 0, 0),
|
||||
'glyph.y' : (1, 'number', 0, 0),
|
||||
'glyph.glyphID' : (1, 'number', 0, 0),
|
||||
b'glyph' : (0, 'number', 1, 1),
|
||||
b'glyph.x' : (1, 'number', 0, 0),
|
||||
b'glyph.y' : (1, 'number', 0, 0),
|
||||
b'glyph.glyphID' : (1, 'number', 0, 0),
|
||||
|
||||
'dehyphen' : (0, 'number', 1, 1),
|
||||
'dehyphen.rootID' : (1, 'number', 0, 0),
|
||||
'dehyphen.stemID' : (1, 'number', 0, 0),
|
||||
'dehyphen.stemPage' : (1, 'number', 0, 0),
|
||||
'dehyphen.sh' : (1, 'number', 0, 0),
|
||||
b'dehyphen' : (0, 'number', 1, 1),
|
||||
b'dehyphen.rootID' : (1, 'number', 0, 0),
|
||||
b'dehyphen.stemID' : (1, 'number', 0, 0),
|
||||
b'dehyphen.stemPage' : (1, 'number', 0, 0),
|
||||
b'dehyphen.sh' : (1, 'number', 0, 0),
|
||||
|
||||
'links' : (0, 'number', 1, 1),
|
||||
'links.page' : (1, 'number', 0, 0),
|
||||
'links.rel' : (1, 'number', 0, 0),
|
||||
'links.row' : (1, 'number', 0, 0),
|
||||
'links.title' : (1, 'text', 0, 0),
|
||||
'links.href' : (1, 'text', 0, 0),
|
||||
'links.type' : (1, 'text', 0, 0),
|
||||
'links.id' : (1, 'number', 0, 0),
|
||||
b'links' : (0, 'number', 1, 1),
|
||||
b'links.page' : (1, 'number', 0, 0),
|
||||
b'links.rel' : (1, 'number', 0, 0),
|
||||
b'links.row' : (1, 'number', 0, 0),
|
||||
b'links.title' : (1, 'text', 0, 0),
|
||||
b'links.href' : (1, 'text', 0, 0),
|
||||
b'links.type' : (1, 'text', 0, 0),
|
||||
b'links.id' : (1, 'number', 0, 0),
|
||||
|
||||
'paraCont' : (0, 'number', 1, 1),
|
||||
'paraCont.rootID' : (1, 'number', 0, 0),
|
||||
'paraCont.stemID' : (1, 'number', 0, 0),
|
||||
'paraCont.stemPage' : (1, 'number', 0, 0),
|
||||
b'paraCont' : (0, 'number', 1, 1),
|
||||
b'paraCont.rootID' : (1, 'number', 0, 0),
|
||||
b'paraCont.stemID' : (1, 'number', 0, 0),
|
||||
b'paraCont.stemPage' : (1, 'number', 0, 0),
|
||||
|
||||
'paraStems' : (0, 'number', 1, 1),
|
||||
'paraStems.stemID' : (1, 'number', 0, 0),
|
||||
b'paraStems' : (0, 'number', 1, 1),
|
||||
b'paraStems.stemID' : (1, 'number', 0, 0),
|
||||
|
||||
'wordStems' : (0, 'number', 1, 1),
|
||||
'wordStems.stemID' : (1, 'number', 0, 0),
|
||||
b'wordStems' : (0, 'number', 1, 1),
|
||||
b'wordStems.stemID' : (1, 'number', 0, 0),
|
||||
|
||||
'empty' : (1, 'snippets', 1, 0),
|
||||
b'empty' : (1, 'snippets', 1, 0),
|
||||
|
||||
'page' : (1, 'snippets', 1, 0),
|
||||
'page.class' : (1, 'scalar_text', 0, 0),
|
||||
'page.pageid' : (1, 'scalar_text', 0, 0),
|
||||
'page.pagelabel' : (1, 'scalar_text', 0, 0),
|
||||
'page.type' : (1, 'scalar_text', 0, 0),
|
||||
'page.h' : (1, 'scalar_number', 0, 0),
|
||||
'page.w' : (1, 'scalar_number', 0, 0),
|
||||
'page.startID' : (1, 'scalar_number', 0, 0),
|
||||
b'page' : (1, 'snippets', 1, 0),
|
||||
b'page.class' : (1, 'scalar_text', 0, 0),
|
||||
b'page.pageid' : (1, 'scalar_text', 0, 0),
|
||||
b'page.pagelabel' : (1, 'scalar_text', 0, 0),
|
||||
b'page.type' : (1, 'scalar_text', 0, 0),
|
||||
b'page.h' : (1, 'scalar_number', 0, 0),
|
||||
b'page.w' : (1, 'scalar_number', 0, 0),
|
||||
b'page.startID' : (1, 'scalar_number', 0, 0),
|
||||
|
||||
'group' : (1, 'snippets', 1, 0),
|
||||
'group.class' : (1, 'scalar_text', 0, 0),
|
||||
'group.type' : (1, 'scalar_text', 0, 0),
|
||||
'group._tag' : (1, 'scalar_text', 0, 0),
|
||||
'group.orientation': (1, 'scalar_text', 0, 0),
|
||||
b'group' : (1, 'snippets', 1, 0),
|
||||
b'group.class' : (1, 'scalar_text', 0, 0),
|
||||
b'group.type' : (1, 'scalar_text', 0, 0),
|
||||
b'group._tag' : (1, 'scalar_text', 0, 0),
|
||||
b'group.orientation': (1, 'scalar_text', 0, 0),
|
||||
|
||||
'region' : (1, 'snippets', 1, 0),
|
||||
'region.class' : (1, 'scalar_text', 0, 0),
|
||||
'region.type' : (1, 'scalar_text', 0, 0),
|
||||
'region.x' : (1, 'scalar_number', 0, 0),
|
||||
'region.y' : (1, 'scalar_number', 0, 0),
|
||||
'region.h' : (1, 'scalar_number', 0, 0),
|
||||
'region.w' : (1, 'scalar_number', 0, 0),
|
||||
'region.orientation' : (1, 'scalar_text', 0, 0),
|
||||
b'region' : (1, 'snippets', 1, 0),
|
||||
b'region.class' : (1, 'scalar_text', 0, 0),
|
||||
b'region.type' : (1, 'scalar_text', 0, 0),
|
||||
b'region.x' : (1, 'scalar_number', 0, 0),
|
||||
b'region.y' : (1, 'scalar_number', 0, 0),
|
||||
b'region.h' : (1, 'scalar_number', 0, 0),
|
||||
b'region.w' : (1, 'scalar_number', 0, 0),
|
||||
b'region.orientation' : (1, 'scalar_text', 0, 0),
|
||||
|
||||
'empty_text_region' : (1, 'snippets', 1, 0),
|
||||
b'empty_text_region' : (1, 'snippets', 1, 0),
|
||||
|
||||
'img' : (1, 'snippets', 1, 0),
|
||||
'img.x' : (1, 'scalar_number', 0, 0),
|
||||
'img.y' : (1, 'scalar_number', 0, 0),
|
||||
'img.h' : (1, 'scalar_number', 0, 0),
|
||||
'img.w' : (1, 'scalar_number', 0, 0),
|
||||
'img.src' : (1, 'scalar_number', 0, 0),
|
||||
'img.color_src' : (1, 'scalar_number', 0, 0),
|
||||
'img.gridSize' : (1, 'scalar_number', 0, 0),
|
||||
'img.gridBottomCenter' : (1, 'scalar_number', 0, 0),
|
||||
'img.gridTopCenter' : (1, 'scalar_number', 0, 0),
|
||||
'img.gridBeginCenter' : (1, 'scalar_number', 0, 0),
|
||||
'img.gridEndCenter' : (1, 'scalar_number', 0, 0),
|
||||
'img.image_type' : (1, 'scalar_number', 0, 0),
|
||||
b'img' : (1, 'snippets', 1, 0),
|
||||
b'img.x' : (1, 'scalar_number', 0, 0),
|
||||
b'img.y' : (1, 'scalar_number', 0, 0),
|
||||
b'img.h' : (1, 'scalar_number', 0, 0),
|
||||
b'img.w' : (1, 'scalar_number', 0, 0),
|
||||
b'img.src' : (1, 'scalar_number', 0, 0),
|
||||
b'img.color_src' : (1, 'scalar_number', 0, 0),
|
||||
b'img.gridSize' : (1, 'scalar_number', 0, 0),
|
||||
b'img.gridBottomCenter' : (1, 'scalar_number', 0, 0),
|
||||
b'img.gridTopCenter' : (1, 'scalar_number', 0, 0),
|
||||
b'img.gridBeginCenter' : (1, 'scalar_number', 0, 0),
|
||||
b'img.gridEndCenter' : (1, 'scalar_number', 0, 0),
|
||||
b'img.image_type' : (1, 'scalar_number', 0, 0),
|
||||
|
||||
'paragraph' : (1, 'snippets', 1, 0),
|
||||
'paragraph.class' : (1, 'scalar_text', 0, 0),
|
||||
'paragraph.firstWord' : (1, 'scalar_number', 0, 0),
|
||||
'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
'paragraph.gridSize' : (1, 'scalar_number', 0, 0),
|
||||
'paragraph.gridBottomCenter' : (1, 'scalar_number', 0, 0),
|
||||
'paragraph.gridTopCenter' : (1, 'scalar_number', 0, 0),
|
||||
'paragraph.gridBeginCenter' : (1, 'scalar_number', 0, 0),
|
||||
'paragraph.gridEndCenter' : (1, 'scalar_number', 0, 0),
|
||||
b'paragraph' : (1, 'snippets', 1, 0),
|
||||
b'paragraph.class' : (1, 'scalar_text', 0, 0),
|
||||
b'paragraph.firstWord' : (1, 'scalar_number', 0, 0),
|
||||
b'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
b'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
b'paragraph.gridSize' : (1, 'scalar_number', 0, 0),
|
||||
b'paragraph.gridBottomCenter' : (1, 'scalar_number', 0, 0),
|
||||
b'paragraph.gridTopCenter' : (1, 'scalar_number', 0, 0),
|
||||
b'paragraph.gridBeginCenter' : (1, 'scalar_number', 0, 0),
|
||||
b'paragraph.gridEndCenter' : (1, 'scalar_number', 0, 0),
|
||||
|
||||
|
||||
'word_semantic' : (1, 'snippets', 1, 1),
|
||||
'word_semantic.type' : (1, 'scalar_text', 0, 0),
|
||||
'word_semantic.class' : (1, 'scalar_text', 0, 0),
|
||||
'word_semantic.firstWord' : (1, 'scalar_number', 0, 0),
|
||||
'word_semantic.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
'word_semantic.gridBottomCenter' : (1, 'scalar_number', 0, 0),
|
||||
'word_semantic.gridTopCenter' : (1, 'scalar_number', 0, 0),
|
||||
'word_semantic.gridBeginCenter' : (1, 'scalar_number', 0, 0),
|
||||
'word_semantic.gridEndCenter' : (1, 'scalar_number', 0, 0),
|
||||
b'word_semantic' : (1, 'snippets', 1, 1),
|
||||
b'word_semantic.type' : (1, 'scalar_text', 0, 0),
|
||||
b'word_semantic.class' : (1, 'scalar_text', 0, 0),
|
||||
b'word_semantic.firstWord' : (1, 'scalar_number', 0, 0),
|
||||
b'word_semantic.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
b'word_semantic.gridBottomCenter' : (1, 'scalar_number', 0, 0),
|
||||
b'word_semantic.gridTopCenter' : (1, 'scalar_number', 0, 0),
|
||||
b'word_semantic.gridBeginCenter' : (1, 'scalar_number', 0, 0),
|
||||
b'word_semantic.gridEndCenter' : (1, 'scalar_number', 0, 0),
|
||||
|
||||
'word' : (1, 'snippets', 1, 0),
|
||||
'word.type' : (1, 'scalar_text', 0, 0),
|
||||
'word.class' : (1, 'scalar_text', 0, 0),
|
||||
'word.firstGlyph' : (1, 'scalar_number', 0, 0),
|
||||
'word.lastGlyph' : (1, 'scalar_number', 0, 0),
|
||||
b'word' : (1, 'snippets', 1, 0),
|
||||
b'word.type' : (1, 'scalar_text', 0, 0),
|
||||
b'word.class' : (1, 'scalar_text', 0, 0),
|
||||
b'word.firstGlyph' : (1, 'scalar_number', 0, 0),
|
||||
b'word.lastGlyph' : (1, 'scalar_number', 0, 0),
|
||||
|
||||
'_span' : (1, 'snippets', 1, 0),
|
||||
'_span.class' : (1, 'scalar_text', 0, 0),
|
||||
'_span.firstWord' : (1, 'scalar_number', 0, 0),
|
||||
'_span.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
'_span.gridSize' : (1, 'scalar_number', 0, 0),
|
||||
'_span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
|
||||
'_span.gridTopCenter' : (1, 'scalar_number', 0, 0),
|
||||
'_span.gridBeginCenter' : (1, 'scalar_number', 0, 0),
|
||||
'_span.gridEndCenter' : (1, 'scalar_number', 0, 0),
|
||||
b'_span' : (1, 'snippets', 1, 0),
|
||||
b'_span.class' : (1, 'scalar_text', 0, 0),
|
||||
b'_span.firstWord' : (1, 'scalar_number', 0, 0),
|
||||
b'_span.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
b'_span.gridSize' : (1, 'scalar_number', 0, 0),
|
||||
b'_span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
|
||||
b'_span.gridTopCenter' : (1, 'scalar_number', 0, 0),
|
||||
b'_span.gridBeginCenter' : (1, 'scalar_number', 0, 0),
|
||||
b'_span.gridEndCenter' : (1, 'scalar_number', 0, 0),
|
||||
|
||||
'span' : (1, 'snippets', 1, 0),
|
||||
'span.firstWord' : (1, 'scalar_number', 0, 0),
|
||||
'span.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
'span.gridSize' : (1, 'scalar_number', 0, 0),
|
||||
'span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
|
||||
'span.gridTopCenter' : (1, 'scalar_number', 0, 0),
|
||||
'span.gridBeginCenter' : (1, 'scalar_number', 0, 0),
|
||||
'span.gridEndCenter' : (1, 'scalar_number', 0, 0),
|
||||
b'span' : (1, 'snippets', 1, 0),
|
||||
b'span.firstWord' : (1, 'scalar_number', 0, 0),
|
||||
b'span.lastWord' : (1, 'scalar_number', 0, 0),
|
||||
b'span.gridSize' : (1, 'scalar_number', 0, 0),
|
||||
b'span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
|
||||
b'span.gridTopCenter' : (1, 'scalar_number', 0, 0),
|
||||
b'span.gridBeginCenter' : (1, 'scalar_number', 0, 0),
|
||||
b'span.gridEndCenter' : (1, 'scalar_number', 0, 0),
|
||||
|
||||
'extratokens' : (1, 'snippets', 1, 0),
|
||||
'extratokens.class' : (1, 'scalar_text', 0, 0),
|
||||
'extratokens.type' : (1, 'scalar_text', 0, 0),
|
||||
'extratokens.firstGlyph' : (1, 'scalar_number', 0, 0),
|
||||
'extratokens.lastGlyph' : (1, 'scalar_number', 0, 0),
|
||||
'extratokens.gridSize' : (1, 'scalar_number', 0, 0),
|
||||
'extratokens.gridBottomCenter' : (1, 'scalar_number', 0, 0),
|
||||
'extratokens.gridTopCenter' : (1, 'scalar_number', 0, 0),
|
||||
'extratokens.gridBeginCenter' : (1, 'scalar_number', 0, 0),
|
||||
'extratokens.gridEndCenter' : (1, 'scalar_number', 0, 0),
|
||||
b'extratokens' : (1, 'snippets', 1, 0),
|
||||
b'extratokens.class' : (1, 'scalar_text', 0, 0),
|
||||
b'extratokens.type' : (1, 'scalar_text', 0, 0),
|
||||
b'extratokens.firstGlyph' : (1, 'scalar_number', 0, 0),
|
||||
b'extratokens.lastGlyph' : (1, 'scalar_number', 0, 0),
|
||||
b'extratokens.gridSize' : (1, 'scalar_number', 0, 0),
|
||||
b'extratokens.gridBottomCenter' : (1, 'scalar_number', 0, 0),
|
||||
b'extratokens.gridTopCenter' : (1, 'scalar_number', 0, 0),
|
||||
b'extratokens.gridBeginCenter' : (1, 'scalar_number', 0, 0),
|
||||
b'extratokens.gridEndCenter' : (1, 'scalar_number', 0, 0),
|
||||
|
||||
'glyph.h' : (1, 'number', 0, 0),
|
||||
'glyph.w' : (1, 'number', 0, 0),
|
||||
'glyph.use' : (1, 'number', 0, 0),
|
||||
'glyph.vtx' : (1, 'number', 0, 1),
|
||||
'glyph.len' : (1, 'number', 0, 1),
|
||||
'glyph.dpi' : (1, 'number', 0, 0),
|
||||
'vtx' : (0, 'number', 1, 1),
|
||||
'vtx.x' : (1, 'number', 0, 0),
|
||||
'vtx.y' : (1, 'number', 0, 0),
|
||||
'len' : (0, 'number', 1, 1),
|
||||
'len.n' : (1, 'number', 0, 0),
|
||||
b'glyph.h' : (1, 'number', 0, 0),
|
||||
b'glyph.w' : (1, 'number', 0, 0),
|
||||
b'glyph.use' : (1, 'number', 0, 0),
|
||||
b'glyph.vtx' : (1, 'number', 0, 1),
|
||||
b'glyph.len' : (1, 'number', 0, 1),
|
||||
b'glyph.dpi' : (1, 'number', 0, 0),
|
||||
b'vtx' : (0, 'number', 1, 1),
|
||||
b'vtx.x' : (1, 'number', 0, 0),
|
||||
b'vtx.y' : (1, 'number', 0, 0),
|
||||
b'len' : (0, 'number', 1, 1),
|
||||
b'len.n' : (1, 'number', 0, 0),
|
||||
|
||||
'book' : (1, 'snippets', 1, 0),
|
||||
'version' : (1, 'snippets', 1, 0),
|
||||
'version.FlowEdit_1_id' : (1, 'scalar_text', 0, 0),
|
||||
'version.FlowEdit_1_version' : (1, 'scalar_text', 0, 0),
|
||||
'version.Schema_id' : (1, 'scalar_text', 0, 0),
|
||||
'version.Schema_version' : (1, 'scalar_text', 0, 0),
|
||||
'version.Topaz_version' : (1, 'scalar_text', 0, 0),
|
||||
'version.WordDetailEdit_1_id' : (1, 'scalar_text', 0, 0),
|
||||
'version.WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0),
|
||||
'version.ZoneEdit_1_id' : (1, 'scalar_text', 0, 0),
|
||||
'version.ZoneEdit_1_version' : (1, 'scalar_text', 0, 0),
|
||||
'version.chapterheaders' : (1, 'scalar_text', 0, 0),
|
||||
'version.creation_date' : (1, 'scalar_text', 0, 0),
|
||||
'version.header_footer' : (1, 'scalar_text', 0, 0),
|
||||
'version.init_from_ocr' : (1, 'scalar_text', 0, 0),
|
||||
'version.letter_insertion' : (1, 'scalar_text', 0, 0),
|
||||
'version.xmlinj_convert' : (1, 'scalar_text', 0, 0),
|
||||
'version.xmlinj_reflow' : (1, 'scalar_text', 0, 0),
|
||||
'version.xmlinj_transform' : (1, 'scalar_text', 0, 0),
|
||||
'version.findlists' : (1, 'scalar_text', 0, 0),
|
||||
'version.page_num' : (1, 'scalar_text', 0, 0),
|
||||
'version.page_type' : (1, 'scalar_text', 0, 0),
|
||||
'version.bad_text' : (1, 'scalar_text', 0, 0),
|
||||
'version.glyph_mismatch' : (1, 'scalar_text', 0, 0),
|
||||
'version.margins' : (1, 'scalar_text', 0, 0),
|
||||
'version.staggered_lines' : (1, 'scalar_text', 0, 0),
|
||||
'version.paragraph_continuation' : (1, 'scalar_text', 0, 0),
|
||||
'version.toc' : (1, 'scalar_text', 0, 0),
|
||||
b'book' : (1, 'snippets', 1, 0),
|
||||
b'version' : (1, 'snippets', 1, 0),
|
||||
b'version.FlowEdit_1_id' : (1, 'scalar_text', 0, 0),
|
||||
b'version.FlowEdit_1_version' : (1, 'scalar_text', 0, 0),
|
||||
b'version.Schema_id' : (1, 'scalar_text', 0, 0),
|
||||
b'version.Schema_version' : (1, 'scalar_text', 0, 0),
|
||||
b'version.Topaz_version' : (1, 'scalar_text', 0, 0),
|
||||
b'version.WordDetailEdit_1_id' : (1, 'scalar_text', 0, 0),
|
||||
b'version.WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0),
|
||||
b'version.ZoneEdit_1_id' : (1, 'scalar_text', 0, 0),
|
||||
b'version.ZoneEdit_1_version' : (1, 'scalar_text', 0, 0),
|
||||
b'version.chapterheaders' : (1, 'scalar_text', 0, 0),
|
||||
b'version.creation_date' : (1, 'scalar_text', 0, 0),
|
||||
b'version.header_footer' : (1, 'scalar_text', 0, 0),
|
||||
b'version.init_from_ocr' : (1, 'scalar_text', 0, 0),
|
||||
b'version.letter_insertion' : (1, 'scalar_text', 0, 0),
|
||||
b'version.xmlinj_convert' : (1, 'scalar_text', 0, 0),
|
||||
b'version.xmlinj_reflow' : (1, 'scalar_text', 0, 0),
|
||||
b'version.xmlinj_transform' : (1, 'scalar_text', 0, 0),
|
||||
b'version.findlists' : (1, 'scalar_text', 0, 0),
|
||||
b'version.page_num' : (1, 'scalar_text', 0, 0),
|
||||
b'version.page_type' : (1, 'scalar_text', 0, 0),
|
||||
b'version.bad_text' : (1, 'scalar_text', 0, 0),
|
||||
b'version.glyph_mismatch' : (1, 'scalar_text', 0, 0),
|
||||
b'version.margins' : (1, 'scalar_text', 0, 0),
|
||||
b'version.staggered_lines' : (1, 'scalar_text', 0, 0),
|
||||
b'version.paragraph_continuation' : (1, 'scalar_text', 0, 0),
|
||||
b'version.toc' : (1, 'scalar_text', 0, 0),
|
||||
|
||||
'stylesheet' : (1, 'snippets', 1, 0),
|
||||
'style' : (1, 'snippets', 1, 0),
|
||||
'style._tag' : (1, 'scalar_text', 0, 0),
|
||||
'style.type' : (1, 'scalar_text', 0, 0),
|
||||
'style._after_type' : (1, 'scalar_text', 0, 0),
|
||||
'style._parent_type' : (1, 'scalar_text', 0, 0),
|
||||
'style._after_parent_type' : (1, 'scalar_text', 0, 0),
|
||||
'style.class' : (1, 'scalar_text', 0, 0),
|
||||
'style._after_class' : (1, 'scalar_text', 0, 0),
|
||||
'rule' : (1, 'snippets', 1, 0),
|
||||
'rule.attr' : (1, 'scalar_text', 0, 0),
|
||||
'rule.value' : (1, 'scalar_text', 0, 0),
|
||||
b'stylesheet' : (1, 'snippets', 1, 0),
|
||||
b'style' : (1, 'snippets', 1, 0),
|
||||
b'style._tag' : (1, 'scalar_text', 0, 0),
|
||||
b'style.type' : (1, 'scalar_text', 0, 0),
|
||||
b'style._after_type' : (1, 'scalar_text', 0, 0),
|
||||
b'style._parent_type' : (1, 'scalar_text', 0, 0),
|
||||
b'style._after_parent_type' : (1, 'scalar_text', 0, 0),
|
||||
b'style.class' : (1, 'scalar_text', 0, 0),
|
||||
b'style._after_class' : (1, 'scalar_text', 0, 0),
|
||||
b'rule' : (1, 'snippets', 1, 0),
|
||||
b'rule.attr' : (1, 'scalar_text', 0, 0),
|
||||
b'rule.value' : (1, 'scalar_text', 0, 0),
|
||||
|
||||
'original' : (0, 'number', 1, 1),
|
||||
'original.pnum' : (1, 'number', 0, 0),
|
||||
'original.pid' : (1, 'text', 0, 0),
|
||||
'pages' : (0, 'number', 1, 1),
|
||||
'pages.ref' : (1, 'number', 0, 0),
|
||||
'pages.id' : (1, 'number', 0, 0),
|
||||
'startID' : (0, 'number', 1, 1),
|
||||
'startID.page' : (1, 'number', 0, 0),
|
||||
'startID.id' : (1, 'number', 0, 0),
|
||||
b'original' : (0, 'number', 1, 1),
|
||||
b'original.pnum' : (1, 'number', 0, 0),
|
||||
b'original.pid' : (1, 'text', 0, 0),
|
||||
b'pages' : (0, 'number', 1, 1),
|
||||
b'pages.ref' : (1, 'number', 0, 0),
|
||||
b'pages.id' : (1, 'number', 0, 0),
|
||||
b'startID' : (0, 'number', 1, 1),
|
||||
b'startID.page' : (1, 'number', 0, 0),
|
||||
b'startID.id' : (1, 'number', 0, 0),
|
||||
|
||||
'median_d' : (1, 'number', 0, 0),
|
||||
'median_h' : (1, 'number', 0, 0),
|
||||
'median_firsty' : (1, 'number', 0, 0),
|
||||
'median_lasty' : (1, 'number', 0, 0),
|
||||
b'median_d' : (1, 'number', 0, 0),
|
||||
b'median_h' : (1, 'number', 0, 0),
|
||||
b'median_firsty' : (1, 'number', 0, 0),
|
||||
b'median_lasty' : (1, 'number', 0, 0),
|
||||
|
||||
'num_footers_maybe' : (1, 'number', 0, 0),
|
||||
'num_footers_yes' : (1, 'number', 0, 0),
|
||||
'num_headers_maybe' : (1, 'number', 0, 0),
|
||||
'num_headers_yes' : (1, 'number', 0, 0),
|
||||
b'num_footers_maybe' : (1, 'number', 0, 0),
|
||||
b'num_footers_yes' : (1, 'number', 0, 0),
|
||||
b'num_headers_maybe' : (1, 'number', 0, 0),
|
||||
b'num_headers_yes' : (1, 'number', 0, 0),
|
||||
|
||||
'tracking' : (1, 'number', 0, 0),
|
||||
'src' : (1, 'text', 0, 0),
|
||||
b'tracking' : (1, 'number', 0, 0),
|
||||
b'src' : (1, 'text', 0, 0),
|
||||
|
||||
}
|
||||
|
||||
@@ -430,7 +430,7 @@ class PageParser(object):
|
||||
cnt = len(self.tagpath)
|
||||
if i < cnt : result = self.tagpath[i]
|
||||
for j in range(i+1, cnt) :
|
||||
result += '.' + self.tagpath[j]
|
||||
result += b'.' + self.tagpath[j]
|
||||
return result
|
||||
|
||||
|
||||
@@ -505,7 +505,7 @@ class PageParser(object):
|
||||
|
||||
if (subtags == 1):
|
||||
ntags = readEncodedNumber(self.fo)
|
||||
if self.debug : print('subtags: ' + token + ' has ' + str(ntags))
|
||||
if self.debug : print('subtags: ', token , ' has ' , str(ntags))
|
||||
for j in range(ntags):
|
||||
val = readEncodedNumber(self.fo)
|
||||
subtagres.append(self.procToken(self.dict.lookup(val)))
|
||||
@@ -613,7 +613,7 @@ class PageParser(object):
|
||||
subtagList = tag[1]
|
||||
argtype = tag[2]
|
||||
argList = tag[3]
|
||||
nname = prefix + '.' + name
|
||||
nname = prefix + b'.' + name
|
||||
nsubtaglist = []
|
||||
for j in subtagList:
|
||||
nsubtaglist.append(self.updateName(j,prefix))
|
||||
@@ -662,34 +662,34 @@ class PageParser(object):
|
||||
subtagList = node[1]
|
||||
argtype = node[2]
|
||||
argList = node[3]
|
||||
fullpathname = name.split('.')
|
||||
fullpathname = name.split(b'.')
|
||||
nodename = fullpathname.pop()
|
||||
ilvl = len(fullpathname)
|
||||
indent = ' ' * (3 * ilvl)
|
||||
indent = b' ' * (3 * ilvl)
|
||||
rlst = []
|
||||
rlst.append(indent + '<' + nodename + '>')
|
||||
rlst.append(indent + b'<' + nodename + b'>')
|
||||
if len(argList) > 0:
|
||||
alst = []
|
||||
for j in argList:
|
||||
if (argtype == 'text') or (argtype == 'scalar_text') :
|
||||
alst.append(j + '|')
|
||||
if (argtype == b'text') or (argtype == b'scalar_text') :
|
||||
alst.append(j + b'|')
|
||||
else :
|
||||
alst.append(str(j) + ',')
|
||||
argres = "".join(alst)
|
||||
alst.append(str(j).encode('utf-8') + b',')
|
||||
argres = b"".join(alst)
|
||||
argres = argres[0:-1]
|
||||
if argtype == 'snippets' :
|
||||
rlst.append('snippets:' + argres)
|
||||
if argtype == b'snippets' :
|
||||
rlst.append(b'snippets:' + argres)
|
||||
else :
|
||||
rlst.append(argres)
|
||||
if len(subtagList) > 0 :
|
||||
rlst.append('\n')
|
||||
rlst.append(b'\n')
|
||||
for j in subtagList:
|
||||
if len(j) > 0 :
|
||||
rlst.append(self.formatTag(j))
|
||||
rlst.append(indent + '</' + nodename + '>\n')
|
||||
rlst.append(indent + b'</' + nodename + b'>\n')
|
||||
else:
|
||||
rlst.append('</' + nodename + '>\n')
|
||||
return "".join(rlst)
|
||||
rlst.append(b'</' + nodename + b'>\n')
|
||||
return b"".join(rlst)
|
||||
|
||||
|
||||
# flatten tag
|
||||
@@ -704,20 +704,20 @@ class PageParser(object):
|
||||
alst = []
|
||||
for j in argList:
|
||||
if (argtype == 'text') or (argtype == 'scalar_text') :
|
||||
alst.append(j + '|')
|
||||
alst.append(j + b'|')
|
||||
else :
|
||||
alst.append(str(j) + '|')
|
||||
argres = "".join(alst)
|
||||
alst.append(str(j).encode('utf-8') + b'|')
|
||||
argres = b"".join(alst)
|
||||
argres = argres[0:-1]
|
||||
if argtype == 'snippets' :
|
||||
rlst.append('.snippets=' + argres)
|
||||
if argtype == b'snippets' :
|
||||
rlst.append(b'.snippets=' + argres)
|
||||
else :
|
||||
rlst.append('=' + argres)
|
||||
rlst.append('\n')
|
||||
rlst.append(b'=' + argres)
|
||||
rlst.append(b'\n')
|
||||
for j in subtagList:
|
||||
if len(j) > 0 :
|
||||
rlst.append(self.flattenTag(j))
|
||||
return "".join(rlst)
|
||||
return b"".join(rlst)
|
||||
|
||||
|
||||
# reduce create xml output
|
||||
@@ -729,7 +729,7 @@ class PageParser(object):
|
||||
rlst.append(self.flattenTag(j))
|
||||
else:
|
||||
rlst.append(self.formatTag(j))
|
||||
result = "".join(rlst)
|
||||
result = b"".join(rlst)
|
||||
if self.debug : print(result)
|
||||
return result
|
||||
|
||||
@@ -747,16 +747,16 @@ class PageParser(object):
|
||||
|
||||
# peek at the first bytes to see what type of file it is
|
||||
magic = self.fo.read(9)
|
||||
if (magic[0:1] == 'p') and (magic[2:9] == 'marker_'):
|
||||
first_token = 'info'
|
||||
elif (magic[0:1] == 'p') and (magic[2:9] == '__PAGE_'):
|
||||
if (magic[0:1] == b'p') and (magic[2:9] == b'marker_'):
|
||||
first_token = b'info'
|
||||
elif (magic[0:1] == b'p') and (magic[2:9] == b'__PAGE_'):
|
||||
skip = self.fo.read(2)
|
||||
first_token = 'info'
|
||||
elif (magic[0:1] == 'p') and (magic[2:8] == '_PAGE_'):
|
||||
first_token = 'info'
|
||||
elif (magic[0:1] == 'g') and (magic[2:9] == '__GLYPH'):
|
||||
first_token = b'info'
|
||||
elif (magic[0:1] == b'p') and (magic[2:8] == b'_PAGE_'):
|
||||
first_token = b'info'
|
||||
elif (magic[0:1] == b'g') and (magic[2:9] == b'__GLYPH'):
|
||||
skip = self.fo.read(3)
|
||||
first_token = 'info'
|
||||
first_token = b'info'
|
||||
else :
|
||||
# other0.dat file
|
||||
first_token = None
|
||||
@@ -778,7 +778,7 @@ class PageParser(object):
|
||||
break
|
||||
|
||||
if (v == 0x72):
|
||||
self.doLoop72('number')
|
||||
self.doLoop72(b'number')
|
||||
elif (v > 0) and (v < self.dict.getSize()) :
|
||||
tag = self.procToken(self.dict.lookup(v))
|
||||
if len(tag) > 0 :
|
||||
@@ -789,7 +789,7 @@ class PageParser(object):
|
||||
if (v == 0):
|
||||
if (self.peek(1) == 0x5f):
|
||||
skip = self.fo.read(1)
|
||||
first_token = 'info'
|
||||
first_token = b'info'
|
||||
|
||||
# now do snippet injection
|
||||
if len(self.snippetList) > 0 :
|
||||
@@ -809,14 +809,14 @@ class PageParser(object):
|
||||
|
||||
def fromData(dict, fname):
|
||||
flat_xml = True
|
||||
debug = False
|
||||
debug = True
|
||||
pp = PageParser(fname, dict, debug, flat_xml)
|
||||
xmlpage = pp.process()
|
||||
return xmlpage
|
||||
|
||||
def getXML(dict, fname):
|
||||
flat_xml = False
|
||||
debug = False
|
||||
debug = True
|
||||
pp = PageParser(fname, dict, debug, flat_xml)
|
||||
xmlpage = pp.process()
|
||||
return xmlpage
|
||||
@@ -845,7 +845,7 @@ def main(argv):
|
||||
sys.stderr=SafeUnbuffered(sys.stderr)
|
||||
dictFile = ""
|
||||
pageFile = ""
|
||||
debug = False
|
||||
debug = True
|
||||
flat_xml = False
|
||||
printOutput = False
|
||||
if len(argv) == 0:
|
||||
|
||||
Reference in New Issue
Block a user