More fixes for Amazon books, fixing identity checks, started on Topaz.

This commit is contained in:
Apprentice Harper
2020-10-16 13:58:59 +01:00
parent dc27c36761
commit 939cdbb0c9
8 changed files with 530 additions and 512 deletions

View File

@@ -56,7 +56,7 @@ def readEncodedNumber(file):
c = file.read(1)
if (len(c) == 0):
return None
data = ord(c)
data = c[0]
datax = (datax <<7) + (data & 0x7F)
data = datax
@@ -188,232 +188,232 @@ class PageParser(object):
# tag : (number of arguments, argument type, subtags present, special case of subtags present when escaped)
token_tags = {
'x' : (1, 'scalar_number', 0, 0),
'y' : (1, 'scalar_number', 0, 0),
'h' : (1, 'scalar_number', 0, 0),
'w' : (1, 'scalar_number', 0, 0),
'firstWord' : (1, 'scalar_number', 0, 0),
'lastWord' : (1, 'scalar_number', 0, 0),
'rootID' : (1, 'scalar_number', 0, 0),
'stemID' : (1, 'scalar_number', 0, 0),
'type' : (1, 'scalar_text', 0, 0),
b'x' : (1, 'scalar_number', 0, 0),
b'y' : (1, 'scalar_number', 0, 0),
b'h' : (1, 'scalar_number', 0, 0),
b'w' : (1, 'scalar_number', 0, 0),
b'firstWord' : (1, 'scalar_number', 0, 0),
b'lastWord' : (1, 'scalar_number', 0, 0),
b'rootID' : (1, 'scalar_number', 0, 0),
b'stemID' : (1, 'scalar_number', 0, 0),
b'type' : (1, 'scalar_text', 0, 0),
'info' : (0, 'number', 1, 0),
b'info' : (0, 'number', 1, 0),
'info.word' : (0, 'number', 1, 1),
'info.word.ocrText' : (1, 'text', 0, 0),
'info.word.firstGlyph' : (1, 'raw', 0, 0),
'info.word.lastGlyph' : (1, 'raw', 0, 0),
'info.word.bl' : (1, 'raw', 0, 0),
'info.word.link_id' : (1, 'number', 0, 0),
b'info.word' : (0, 'number', 1, 1),
b'info.word.ocrText' : (1, 'text', 0, 0),
b'info.word.firstGlyph' : (1, 'raw', 0, 0),
b'info.word.lastGlyph' : (1, 'raw', 0, 0),
b'info.word.bl' : (1, 'raw', 0, 0),
b'info.word.link_id' : (1, 'number', 0, 0),
'glyph' : (0, 'number', 1, 1),
'glyph.x' : (1, 'number', 0, 0),
'glyph.y' : (1, 'number', 0, 0),
'glyph.glyphID' : (1, 'number', 0, 0),
b'glyph' : (0, 'number', 1, 1),
b'glyph.x' : (1, 'number', 0, 0),
b'glyph.y' : (1, 'number', 0, 0),
b'glyph.glyphID' : (1, 'number', 0, 0),
'dehyphen' : (0, 'number', 1, 1),
'dehyphen.rootID' : (1, 'number', 0, 0),
'dehyphen.stemID' : (1, 'number', 0, 0),
'dehyphen.stemPage' : (1, 'number', 0, 0),
'dehyphen.sh' : (1, 'number', 0, 0),
b'dehyphen' : (0, 'number', 1, 1),
b'dehyphen.rootID' : (1, 'number', 0, 0),
b'dehyphen.stemID' : (1, 'number', 0, 0),
b'dehyphen.stemPage' : (1, 'number', 0, 0),
b'dehyphen.sh' : (1, 'number', 0, 0),
'links' : (0, 'number', 1, 1),
'links.page' : (1, 'number', 0, 0),
'links.rel' : (1, 'number', 0, 0),
'links.row' : (1, 'number', 0, 0),
'links.title' : (1, 'text', 0, 0),
'links.href' : (1, 'text', 0, 0),
'links.type' : (1, 'text', 0, 0),
'links.id' : (1, 'number', 0, 0),
b'links' : (0, 'number', 1, 1),
b'links.page' : (1, 'number', 0, 0),
b'links.rel' : (1, 'number', 0, 0),
b'links.row' : (1, 'number', 0, 0),
b'links.title' : (1, 'text', 0, 0),
b'links.href' : (1, 'text', 0, 0),
b'links.type' : (1, 'text', 0, 0),
b'links.id' : (1, 'number', 0, 0),
'paraCont' : (0, 'number', 1, 1),
'paraCont.rootID' : (1, 'number', 0, 0),
'paraCont.stemID' : (1, 'number', 0, 0),
'paraCont.stemPage' : (1, 'number', 0, 0),
b'paraCont' : (0, 'number', 1, 1),
b'paraCont.rootID' : (1, 'number', 0, 0),
b'paraCont.stemID' : (1, 'number', 0, 0),
b'paraCont.stemPage' : (1, 'number', 0, 0),
'paraStems' : (0, 'number', 1, 1),
'paraStems.stemID' : (1, 'number', 0, 0),
b'paraStems' : (0, 'number', 1, 1),
b'paraStems.stemID' : (1, 'number', 0, 0),
'wordStems' : (0, 'number', 1, 1),
'wordStems.stemID' : (1, 'number', 0, 0),
b'wordStems' : (0, 'number', 1, 1),
b'wordStems.stemID' : (1, 'number', 0, 0),
'empty' : (1, 'snippets', 1, 0),
b'empty' : (1, 'snippets', 1, 0),
'page' : (1, 'snippets', 1, 0),
'page.class' : (1, 'scalar_text', 0, 0),
'page.pageid' : (1, 'scalar_text', 0, 0),
'page.pagelabel' : (1, 'scalar_text', 0, 0),
'page.type' : (1, 'scalar_text', 0, 0),
'page.h' : (1, 'scalar_number', 0, 0),
'page.w' : (1, 'scalar_number', 0, 0),
'page.startID' : (1, 'scalar_number', 0, 0),
b'page' : (1, 'snippets', 1, 0),
b'page.class' : (1, 'scalar_text', 0, 0),
b'page.pageid' : (1, 'scalar_text', 0, 0),
b'page.pagelabel' : (1, 'scalar_text', 0, 0),
b'page.type' : (1, 'scalar_text', 0, 0),
b'page.h' : (1, 'scalar_number', 0, 0),
b'page.w' : (1, 'scalar_number', 0, 0),
b'page.startID' : (1, 'scalar_number', 0, 0),
'group' : (1, 'snippets', 1, 0),
'group.class' : (1, 'scalar_text', 0, 0),
'group.type' : (1, 'scalar_text', 0, 0),
'group._tag' : (1, 'scalar_text', 0, 0),
'group.orientation': (1, 'scalar_text', 0, 0),
b'group' : (1, 'snippets', 1, 0),
b'group.class' : (1, 'scalar_text', 0, 0),
b'group.type' : (1, 'scalar_text', 0, 0),
b'group._tag' : (1, 'scalar_text', 0, 0),
b'group.orientation': (1, 'scalar_text', 0, 0),
'region' : (1, 'snippets', 1, 0),
'region.class' : (1, 'scalar_text', 0, 0),
'region.type' : (1, 'scalar_text', 0, 0),
'region.x' : (1, 'scalar_number', 0, 0),
'region.y' : (1, 'scalar_number', 0, 0),
'region.h' : (1, 'scalar_number', 0, 0),
'region.w' : (1, 'scalar_number', 0, 0),
'region.orientation' : (1, 'scalar_text', 0, 0),
b'region' : (1, 'snippets', 1, 0),
b'region.class' : (1, 'scalar_text', 0, 0),
b'region.type' : (1, 'scalar_text', 0, 0),
b'region.x' : (1, 'scalar_number', 0, 0),
b'region.y' : (1, 'scalar_number', 0, 0),
b'region.h' : (1, 'scalar_number', 0, 0),
b'region.w' : (1, 'scalar_number', 0, 0),
b'region.orientation' : (1, 'scalar_text', 0, 0),
'empty_text_region' : (1, 'snippets', 1, 0),
b'empty_text_region' : (1, 'snippets', 1, 0),
'img' : (1, 'snippets', 1, 0),
'img.x' : (1, 'scalar_number', 0, 0),
'img.y' : (1, 'scalar_number', 0, 0),
'img.h' : (1, 'scalar_number', 0, 0),
'img.w' : (1, 'scalar_number', 0, 0),
'img.src' : (1, 'scalar_number', 0, 0),
'img.color_src' : (1, 'scalar_number', 0, 0),
'img.gridSize' : (1, 'scalar_number', 0, 0),
'img.gridBottomCenter' : (1, 'scalar_number', 0, 0),
'img.gridTopCenter' : (1, 'scalar_number', 0, 0),
'img.gridBeginCenter' : (1, 'scalar_number', 0, 0),
'img.gridEndCenter' : (1, 'scalar_number', 0, 0),
'img.image_type' : (1, 'scalar_number', 0, 0),
b'img' : (1, 'snippets', 1, 0),
b'img.x' : (1, 'scalar_number', 0, 0),
b'img.y' : (1, 'scalar_number', 0, 0),
b'img.h' : (1, 'scalar_number', 0, 0),
b'img.w' : (1, 'scalar_number', 0, 0),
b'img.src' : (1, 'scalar_number', 0, 0),
b'img.color_src' : (1, 'scalar_number', 0, 0),
b'img.gridSize' : (1, 'scalar_number', 0, 0),
b'img.gridBottomCenter' : (1, 'scalar_number', 0, 0),
b'img.gridTopCenter' : (1, 'scalar_number', 0, 0),
b'img.gridBeginCenter' : (1, 'scalar_number', 0, 0),
b'img.gridEndCenter' : (1, 'scalar_number', 0, 0),
b'img.image_type' : (1, 'scalar_number', 0, 0),
'paragraph' : (1, 'snippets', 1, 0),
'paragraph.class' : (1, 'scalar_text', 0, 0),
'paragraph.firstWord' : (1, 'scalar_number', 0, 0),
'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
'paragraph.gridSize' : (1, 'scalar_number', 0, 0),
'paragraph.gridBottomCenter' : (1, 'scalar_number', 0, 0),
'paragraph.gridTopCenter' : (1, 'scalar_number', 0, 0),
'paragraph.gridBeginCenter' : (1, 'scalar_number', 0, 0),
'paragraph.gridEndCenter' : (1, 'scalar_number', 0, 0),
b'paragraph' : (1, 'snippets', 1, 0),
b'paragraph.class' : (1, 'scalar_text', 0, 0),
b'paragraph.firstWord' : (1, 'scalar_number', 0, 0),
b'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
b'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
b'paragraph.gridSize' : (1, 'scalar_number', 0, 0),
b'paragraph.gridBottomCenter' : (1, 'scalar_number', 0, 0),
b'paragraph.gridTopCenter' : (1, 'scalar_number', 0, 0),
b'paragraph.gridBeginCenter' : (1, 'scalar_number', 0, 0),
b'paragraph.gridEndCenter' : (1, 'scalar_number', 0, 0),
'word_semantic' : (1, 'snippets', 1, 1),
'word_semantic.type' : (1, 'scalar_text', 0, 0),
'word_semantic.class' : (1, 'scalar_text', 0, 0),
'word_semantic.firstWord' : (1, 'scalar_number', 0, 0),
'word_semantic.lastWord' : (1, 'scalar_number', 0, 0),
'word_semantic.gridBottomCenter' : (1, 'scalar_number', 0, 0),
'word_semantic.gridTopCenter' : (1, 'scalar_number', 0, 0),
'word_semantic.gridBeginCenter' : (1, 'scalar_number', 0, 0),
'word_semantic.gridEndCenter' : (1, 'scalar_number', 0, 0),
b'word_semantic' : (1, 'snippets', 1, 1),
b'word_semantic.type' : (1, 'scalar_text', 0, 0),
b'word_semantic.class' : (1, 'scalar_text', 0, 0),
b'word_semantic.firstWord' : (1, 'scalar_number', 0, 0),
b'word_semantic.lastWord' : (1, 'scalar_number', 0, 0),
b'word_semantic.gridBottomCenter' : (1, 'scalar_number', 0, 0),
b'word_semantic.gridTopCenter' : (1, 'scalar_number', 0, 0),
b'word_semantic.gridBeginCenter' : (1, 'scalar_number', 0, 0),
b'word_semantic.gridEndCenter' : (1, 'scalar_number', 0, 0),
'word' : (1, 'snippets', 1, 0),
'word.type' : (1, 'scalar_text', 0, 0),
'word.class' : (1, 'scalar_text', 0, 0),
'word.firstGlyph' : (1, 'scalar_number', 0, 0),
'word.lastGlyph' : (1, 'scalar_number', 0, 0),
b'word' : (1, 'snippets', 1, 0),
b'word.type' : (1, 'scalar_text', 0, 0),
b'word.class' : (1, 'scalar_text', 0, 0),
b'word.firstGlyph' : (1, 'scalar_number', 0, 0),
b'word.lastGlyph' : (1, 'scalar_number', 0, 0),
'_span' : (1, 'snippets', 1, 0),
'_span.class' : (1, 'scalar_text', 0, 0),
'_span.firstWord' : (1, 'scalar_number', 0, 0),
'_span.lastWord' : (1, 'scalar_number', 0, 0),
'_span.gridSize' : (1, 'scalar_number', 0, 0),
'_span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
'_span.gridTopCenter' : (1, 'scalar_number', 0, 0),
'_span.gridBeginCenter' : (1, 'scalar_number', 0, 0),
'_span.gridEndCenter' : (1, 'scalar_number', 0, 0),
b'_span' : (1, 'snippets', 1, 0),
b'_span.class' : (1, 'scalar_text', 0, 0),
b'_span.firstWord' : (1, 'scalar_number', 0, 0),
b'_span.lastWord' : (1, 'scalar_number', 0, 0),
b'_span.gridSize' : (1, 'scalar_number', 0, 0),
b'_span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
b'_span.gridTopCenter' : (1, 'scalar_number', 0, 0),
b'_span.gridBeginCenter' : (1, 'scalar_number', 0, 0),
b'_span.gridEndCenter' : (1, 'scalar_number', 0, 0),
'span' : (1, 'snippets', 1, 0),
'span.firstWord' : (1, 'scalar_number', 0, 0),
'span.lastWord' : (1, 'scalar_number', 0, 0),
'span.gridSize' : (1, 'scalar_number', 0, 0),
'span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
'span.gridTopCenter' : (1, 'scalar_number', 0, 0),
'span.gridBeginCenter' : (1, 'scalar_number', 0, 0),
'span.gridEndCenter' : (1, 'scalar_number', 0, 0),
b'span' : (1, 'snippets', 1, 0),
b'span.firstWord' : (1, 'scalar_number', 0, 0),
b'span.lastWord' : (1, 'scalar_number', 0, 0),
b'span.gridSize' : (1, 'scalar_number', 0, 0),
b'span.gridBottomCenter' : (1, 'scalar_number', 0, 0),
b'span.gridTopCenter' : (1, 'scalar_number', 0, 0),
b'span.gridBeginCenter' : (1, 'scalar_number', 0, 0),
b'span.gridEndCenter' : (1, 'scalar_number', 0, 0),
'extratokens' : (1, 'snippets', 1, 0),
'extratokens.class' : (1, 'scalar_text', 0, 0),
'extratokens.type' : (1, 'scalar_text', 0, 0),
'extratokens.firstGlyph' : (1, 'scalar_number', 0, 0),
'extratokens.lastGlyph' : (1, 'scalar_number', 0, 0),
'extratokens.gridSize' : (1, 'scalar_number', 0, 0),
'extratokens.gridBottomCenter' : (1, 'scalar_number', 0, 0),
'extratokens.gridTopCenter' : (1, 'scalar_number', 0, 0),
'extratokens.gridBeginCenter' : (1, 'scalar_number', 0, 0),
'extratokens.gridEndCenter' : (1, 'scalar_number', 0, 0),
b'extratokens' : (1, 'snippets', 1, 0),
b'extratokens.class' : (1, 'scalar_text', 0, 0),
b'extratokens.type' : (1, 'scalar_text', 0, 0),
b'extratokens.firstGlyph' : (1, 'scalar_number', 0, 0),
b'extratokens.lastGlyph' : (1, 'scalar_number', 0, 0),
b'extratokens.gridSize' : (1, 'scalar_number', 0, 0),
b'extratokens.gridBottomCenter' : (1, 'scalar_number', 0, 0),
b'extratokens.gridTopCenter' : (1, 'scalar_number', 0, 0),
b'extratokens.gridBeginCenter' : (1, 'scalar_number', 0, 0),
b'extratokens.gridEndCenter' : (1, 'scalar_number', 0, 0),
'glyph.h' : (1, 'number', 0, 0),
'glyph.w' : (1, 'number', 0, 0),
'glyph.use' : (1, 'number', 0, 0),
'glyph.vtx' : (1, 'number', 0, 1),
'glyph.len' : (1, 'number', 0, 1),
'glyph.dpi' : (1, 'number', 0, 0),
'vtx' : (0, 'number', 1, 1),
'vtx.x' : (1, 'number', 0, 0),
'vtx.y' : (1, 'number', 0, 0),
'len' : (0, 'number', 1, 1),
'len.n' : (1, 'number', 0, 0),
b'glyph.h' : (1, 'number', 0, 0),
b'glyph.w' : (1, 'number', 0, 0),
b'glyph.use' : (1, 'number', 0, 0),
b'glyph.vtx' : (1, 'number', 0, 1),
b'glyph.len' : (1, 'number', 0, 1),
b'glyph.dpi' : (1, 'number', 0, 0),
b'vtx' : (0, 'number', 1, 1),
b'vtx.x' : (1, 'number', 0, 0),
b'vtx.y' : (1, 'number', 0, 0),
b'len' : (0, 'number', 1, 1),
b'len.n' : (1, 'number', 0, 0),
'book' : (1, 'snippets', 1, 0),
'version' : (1, 'snippets', 1, 0),
'version.FlowEdit_1_id' : (1, 'scalar_text', 0, 0),
'version.FlowEdit_1_version' : (1, 'scalar_text', 0, 0),
'version.Schema_id' : (1, 'scalar_text', 0, 0),
'version.Schema_version' : (1, 'scalar_text', 0, 0),
'version.Topaz_version' : (1, 'scalar_text', 0, 0),
'version.WordDetailEdit_1_id' : (1, 'scalar_text', 0, 0),
'version.WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0),
'version.ZoneEdit_1_id' : (1, 'scalar_text', 0, 0),
'version.ZoneEdit_1_version' : (1, 'scalar_text', 0, 0),
'version.chapterheaders' : (1, 'scalar_text', 0, 0),
'version.creation_date' : (1, 'scalar_text', 0, 0),
'version.header_footer' : (1, 'scalar_text', 0, 0),
'version.init_from_ocr' : (1, 'scalar_text', 0, 0),
'version.letter_insertion' : (1, 'scalar_text', 0, 0),
'version.xmlinj_convert' : (1, 'scalar_text', 0, 0),
'version.xmlinj_reflow' : (1, 'scalar_text', 0, 0),
'version.xmlinj_transform' : (1, 'scalar_text', 0, 0),
'version.findlists' : (1, 'scalar_text', 0, 0),
'version.page_num' : (1, 'scalar_text', 0, 0),
'version.page_type' : (1, 'scalar_text', 0, 0),
'version.bad_text' : (1, 'scalar_text', 0, 0),
'version.glyph_mismatch' : (1, 'scalar_text', 0, 0),
'version.margins' : (1, 'scalar_text', 0, 0),
'version.staggered_lines' : (1, 'scalar_text', 0, 0),
'version.paragraph_continuation' : (1, 'scalar_text', 0, 0),
'version.toc' : (1, 'scalar_text', 0, 0),
b'book' : (1, 'snippets', 1, 0),
b'version' : (1, 'snippets', 1, 0),
b'version.FlowEdit_1_id' : (1, 'scalar_text', 0, 0),
b'version.FlowEdit_1_version' : (1, 'scalar_text', 0, 0),
b'version.Schema_id' : (1, 'scalar_text', 0, 0),
b'version.Schema_version' : (1, 'scalar_text', 0, 0),
b'version.Topaz_version' : (1, 'scalar_text', 0, 0),
b'version.WordDetailEdit_1_id' : (1, 'scalar_text', 0, 0),
b'version.WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0),
b'version.ZoneEdit_1_id' : (1, 'scalar_text', 0, 0),
b'version.ZoneEdit_1_version' : (1, 'scalar_text', 0, 0),
b'version.chapterheaders' : (1, 'scalar_text', 0, 0),
b'version.creation_date' : (1, 'scalar_text', 0, 0),
b'version.header_footer' : (1, 'scalar_text', 0, 0),
b'version.init_from_ocr' : (1, 'scalar_text', 0, 0),
b'version.letter_insertion' : (1, 'scalar_text', 0, 0),
b'version.xmlinj_convert' : (1, 'scalar_text', 0, 0),
b'version.xmlinj_reflow' : (1, 'scalar_text', 0, 0),
b'version.xmlinj_transform' : (1, 'scalar_text', 0, 0),
b'version.findlists' : (1, 'scalar_text', 0, 0),
b'version.page_num' : (1, 'scalar_text', 0, 0),
b'version.page_type' : (1, 'scalar_text', 0, 0),
b'version.bad_text' : (1, 'scalar_text', 0, 0),
b'version.glyph_mismatch' : (1, 'scalar_text', 0, 0),
b'version.margins' : (1, 'scalar_text', 0, 0),
b'version.staggered_lines' : (1, 'scalar_text', 0, 0),
b'version.paragraph_continuation' : (1, 'scalar_text', 0, 0),
b'version.toc' : (1, 'scalar_text', 0, 0),
'stylesheet' : (1, 'snippets', 1, 0),
'style' : (1, 'snippets', 1, 0),
'style._tag' : (1, 'scalar_text', 0, 0),
'style.type' : (1, 'scalar_text', 0, 0),
'style._after_type' : (1, 'scalar_text', 0, 0),
'style._parent_type' : (1, 'scalar_text', 0, 0),
'style._after_parent_type' : (1, 'scalar_text', 0, 0),
'style.class' : (1, 'scalar_text', 0, 0),
'style._after_class' : (1, 'scalar_text', 0, 0),
'rule' : (1, 'snippets', 1, 0),
'rule.attr' : (1, 'scalar_text', 0, 0),
'rule.value' : (1, 'scalar_text', 0, 0),
b'stylesheet' : (1, 'snippets', 1, 0),
b'style' : (1, 'snippets', 1, 0),
b'style._tag' : (1, 'scalar_text', 0, 0),
b'style.type' : (1, 'scalar_text', 0, 0),
b'style._after_type' : (1, 'scalar_text', 0, 0),
b'style._parent_type' : (1, 'scalar_text', 0, 0),
b'style._after_parent_type' : (1, 'scalar_text', 0, 0),
b'style.class' : (1, 'scalar_text', 0, 0),
b'style._after_class' : (1, 'scalar_text', 0, 0),
b'rule' : (1, 'snippets', 1, 0),
b'rule.attr' : (1, 'scalar_text', 0, 0),
b'rule.value' : (1, 'scalar_text', 0, 0),
'original' : (0, 'number', 1, 1),
'original.pnum' : (1, 'number', 0, 0),
'original.pid' : (1, 'text', 0, 0),
'pages' : (0, 'number', 1, 1),
'pages.ref' : (1, 'number', 0, 0),
'pages.id' : (1, 'number', 0, 0),
'startID' : (0, 'number', 1, 1),
'startID.page' : (1, 'number', 0, 0),
'startID.id' : (1, 'number', 0, 0),
b'original' : (0, 'number', 1, 1),
b'original.pnum' : (1, 'number', 0, 0),
b'original.pid' : (1, 'text', 0, 0),
b'pages' : (0, 'number', 1, 1),
b'pages.ref' : (1, 'number', 0, 0),
b'pages.id' : (1, 'number', 0, 0),
b'startID' : (0, 'number', 1, 1),
b'startID.page' : (1, 'number', 0, 0),
b'startID.id' : (1, 'number', 0, 0),
'median_d' : (1, 'number', 0, 0),
'median_h' : (1, 'number', 0, 0),
'median_firsty' : (1, 'number', 0, 0),
'median_lasty' : (1, 'number', 0, 0),
b'median_d' : (1, 'number', 0, 0),
b'median_h' : (1, 'number', 0, 0),
b'median_firsty' : (1, 'number', 0, 0),
b'median_lasty' : (1, 'number', 0, 0),
'num_footers_maybe' : (1, 'number', 0, 0),
'num_footers_yes' : (1, 'number', 0, 0),
'num_headers_maybe' : (1, 'number', 0, 0),
'num_headers_yes' : (1, 'number', 0, 0),
b'num_footers_maybe' : (1, 'number', 0, 0),
b'num_footers_yes' : (1, 'number', 0, 0),
b'num_headers_maybe' : (1, 'number', 0, 0),
b'num_headers_yes' : (1, 'number', 0, 0),
'tracking' : (1, 'number', 0, 0),
'src' : (1, 'text', 0, 0),
b'tracking' : (1, 'number', 0, 0),
b'src' : (1, 'text', 0, 0),
}
@@ -430,7 +430,7 @@ class PageParser(object):
cnt = len(self.tagpath)
if i < cnt : result = self.tagpath[i]
for j in range(i+1, cnt) :
result += '.' + self.tagpath[j]
result += b'.' + self.tagpath[j]
return result
@@ -505,7 +505,7 @@ class PageParser(object):
if (subtags == 1):
ntags = readEncodedNumber(self.fo)
if self.debug : print('subtags: ' + token + ' has ' + str(ntags))
if self.debug : print('subtags: ', token , ' has ' , str(ntags))
for j in range(ntags):
val = readEncodedNumber(self.fo)
subtagres.append(self.procToken(self.dict.lookup(val)))
@@ -613,7 +613,7 @@ class PageParser(object):
subtagList = tag[1]
argtype = tag[2]
argList = tag[3]
nname = prefix + '.' + name
nname = prefix + b'.' + name
nsubtaglist = []
for j in subtagList:
nsubtaglist.append(self.updateName(j,prefix))
@@ -662,34 +662,34 @@ class PageParser(object):
subtagList = node[1]
argtype = node[2]
argList = node[3]
fullpathname = name.split('.')
fullpathname = name.split(b'.')
nodename = fullpathname.pop()
ilvl = len(fullpathname)
indent = ' ' * (3 * ilvl)
indent = b' ' * (3 * ilvl)
rlst = []
rlst.append(indent + '<' + nodename + '>')
rlst.append(indent + b'<' + nodename + b'>')
if len(argList) > 0:
alst = []
for j in argList:
if (argtype == 'text') or (argtype == 'scalar_text') :
alst.append(j + '|')
if (argtype == b'text') or (argtype == b'scalar_text') :
alst.append(j + b'|')
else :
alst.append(str(j) + ',')
argres = "".join(alst)
alst.append(str(j).encode('utf-8') + b',')
argres = b"".join(alst)
argres = argres[0:-1]
if argtype == 'snippets' :
rlst.append('snippets:' + argres)
if argtype == b'snippets' :
rlst.append(b'snippets:' + argres)
else :
rlst.append(argres)
if len(subtagList) > 0 :
rlst.append('\n')
rlst.append(b'\n')
for j in subtagList:
if len(j) > 0 :
rlst.append(self.formatTag(j))
rlst.append(indent + '</' + nodename + '>\n')
rlst.append(indent + b'</' + nodename + b'>\n')
else:
rlst.append('</' + nodename + '>\n')
return "".join(rlst)
rlst.append(b'</' + nodename + b'>\n')
return b"".join(rlst)
# flatten tag
@@ -704,20 +704,20 @@ class PageParser(object):
alst = []
for j in argList:
if (argtype == 'text') or (argtype == 'scalar_text') :
alst.append(j + '|')
alst.append(j + b'|')
else :
alst.append(str(j) + '|')
argres = "".join(alst)
alst.append(str(j).encode('utf-8') + b'|')
argres = b"".join(alst)
argres = argres[0:-1]
if argtype == 'snippets' :
rlst.append('.snippets=' + argres)
if argtype == b'snippets' :
rlst.append(b'.snippets=' + argres)
else :
rlst.append('=' + argres)
rlst.append('\n')
rlst.append(b'=' + argres)
rlst.append(b'\n')
for j in subtagList:
if len(j) > 0 :
rlst.append(self.flattenTag(j))
return "".join(rlst)
return b"".join(rlst)
# reduce create xml output
@@ -729,7 +729,7 @@ class PageParser(object):
rlst.append(self.flattenTag(j))
else:
rlst.append(self.formatTag(j))
result = "".join(rlst)
result = b"".join(rlst)
if self.debug : print(result)
return result
@@ -747,16 +747,16 @@ class PageParser(object):
# peek at the first bytes to see what type of file it is
magic = self.fo.read(9)
if (magic[0:1] == 'p') and (magic[2:9] == 'marker_'):
first_token = 'info'
elif (magic[0:1] == 'p') and (magic[2:9] == '__PAGE_'):
if (magic[0:1] == b'p') and (magic[2:9] == b'marker_'):
first_token = b'info'
elif (magic[0:1] == b'p') and (magic[2:9] == b'__PAGE_'):
skip = self.fo.read(2)
first_token = 'info'
elif (magic[0:1] == 'p') and (magic[2:8] == '_PAGE_'):
first_token = 'info'
elif (magic[0:1] == 'g') and (magic[2:9] == '__GLYPH'):
first_token = b'info'
elif (magic[0:1] == b'p') and (magic[2:8] == b'_PAGE_'):
first_token = b'info'
elif (magic[0:1] == b'g') and (magic[2:9] == b'__GLYPH'):
skip = self.fo.read(3)
first_token = 'info'
first_token = b'info'
else :
# other0.dat file
first_token = None
@@ -778,7 +778,7 @@ class PageParser(object):
break
if (v == 0x72):
self.doLoop72('number')
self.doLoop72(b'number')
elif (v > 0) and (v < self.dict.getSize()) :
tag = self.procToken(self.dict.lookup(v))
if len(tag) > 0 :
@@ -789,7 +789,7 @@ class PageParser(object):
if (v == 0):
if (self.peek(1) == 0x5f):
skip = self.fo.read(1)
first_token = 'info'
first_token = b'info'
# now do snippet injection
if len(self.snippetList) > 0 :
@@ -809,14 +809,14 @@ class PageParser(object):
def fromData(dict, fname):
flat_xml = True
debug = False
debug = True
pp = PageParser(fname, dict, debug, flat_xml)
xmlpage = pp.process()
return xmlpage
def getXML(dict, fname):
flat_xml = False
debug = False
debug = True
pp = PageParser(fname, dict, debug, flat_xml)
xmlpage = pp.process()
return xmlpage
@@ -845,7 +845,7 @@ def main(argv):
sys.stderr=SafeUnbuffered(sys.stderr)
dictFile = ""
pageFile = ""
debug = False
debug = True
flat_xml = False
printOutput = False
if len(argv) == 0: