Compare commits

...

3 Commits
v1.0 ... v1.1

Author SHA1 Message Date
Apprentice Alf
2819550411 tools v1.1 2015-03-02 07:32:21 +00:00
Anonymous
f8154c4615 ineptpdf 5 by anon 2015-02-28 14:38:24 +00:00
i♥cabbages
58833e7dc5 Unknown date, late 2009/early 2010 2015-02-28 14:35:29 +00:00
3 changed files with 320 additions and 109 deletions

View File

@@ -336,5 +336,6 @@ def gui_main():
return 0 return 0
if __name__ == '__main__': if __name__ == '__main__':
# sys.exit(cli_main()) if len(sys.argv) > 1:
sys.exit(cli_main())
sys.exit(gui_main()) sys.exit(gui_main())

View File

@@ -346,35 +346,40 @@ class DocParser(object):
if end == -1 : if end == -1 :
end = self.docSize end = self.docSize
# seems some xml has last* coming before first* so we have to
# handle any order
sp_first = -1
sp_last = -1
gl_first = -1
gl_last = -1
ws_first = -1
ws_last = -1
word_class = ''
while (line < end) : while (line < end) :
(name, argres) = self.lineinDoc(line) (name, argres) = self.lineinDoc(line)
# handle both span and _span
if name.endswith('span.firstWord') : if name.endswith('span.firstWord') :
first = int(argres) sp_first = int(argres)
(name, argres) = self.lineinDoc(line+1)
if not name.endswith('span.lastWord'): elif name.endswith('span.lastWord') :
print 'Error: - incorrect _span ordering inside paragraph' sp_last = int(argres)
last = int(argres)
for wordnum in xrange(first, last):
result.append(('ocr', wordnum))
line += 1
elif name.endswith('word.firstGlyph') : elif name.endswith('word.firstGlyph') :
first = int(argres) gl_first = int(argres)
(name, argres) = self.lineinDoc(line+1)
if not name.endswith('word.lastGlyph'): elif name.endswith('word.lastGlyph') :
print 'Error: - incorrect glyph ordering inside word in paragraph' gl_last = int(argres)
last = int(argres)
glyphList = [] elif name.endswith('word_semantic.firstWord'):
for glyphnum in xrange(first, last): ws_first = int(argres)
glyphList.append(glyphnum)
num = self.svgcount elif name.endswith('word_semantic.lastWord'):
self.glyphs_to_image(glyphList) ws_last = int(argres)
self.svgcount += 1
result.append(('svg', num))
line += 1
elif name.endswith('word.class'): elif name.endswith('word.class'):
(cname, space) = argres.split('-',1) (cname, space) = argres.split('-',1)
@@ -386,15 +391,28 @@ class DocParser(object):
result.append(('img' + word_class, int(argres))) result.append(('img' + word_class, int(argres)))
word_class = '' word_class = ''
elif name.endswith('word_semantic.firstWord'): if (sp_first != -1) and (sp_last != -1):
first = int(argres) for wordnum in xrange(sp_first, sp_last):
(name, argres) = self.lineinDoc(line+1)
if not name.endswith('word_semantic.lastWord'):
print 'Error: - incorrect word_semantic ordering inside paragraph'
last = int(argres)
for wordnum in xrange(first, last):
result.append(('ocr', wordnum)) result.append(('ocr', wordnum))
line += 1 sp_first = -1
sp_last = -1
if (gl_first != -1) and (gl_last != -1):
glyphList = []
for glyphnum in xrange(gl_first, gl_last):
glyphList.append(glyphnum)
num = self.svgcount
self.glyphs_to_image(glyphList)
self.svgcount += 1
result.append(('svg', num))
gl_first = -1
gl_last = -1
if (ws_first != -1) and (ws_last != -1):
for wordnum in xrange(ws_first, ws_last):
result.append(('ocr', wordnum))
ws_first = -1
ws_last = -1
line += 1 line += 1

View File

@@ -1,6 +1,6 @@
#! /usr/bin/python #! /usr/bin/python
# ineptpdf.pyw, version 2 # ineptpdf5.pyw, version 5
# To run this program install Python 2.6 from http://www.python.org/download/ # To run this program install Python 2.6 from http://www.python.org/download/
# and PyCrypto from http://www.voidspace.org.uk/python/modules.shtml#pycrypto # and PyCrypto from http://www.voidspace.org.uk/python/modules.shtml#pycrypto
@@ -10,6 +10,9 @@
# Revision history: # Revision history:
# 1 - Initial release # 1 - Initial release
# 2 - Improved determination of key-generation algorithm # 2 - Improved determination of key-generation algorithm
# 3 - Correctly handle PDF >=1.5 cross-reference streams
# 4 - Removal of ciando's personal ID (anon)
# 5 - removing small bug with V3 ebooks (anon)
""" """
Decrypt Adobe ADEPT-encrypted PDF files. Decrypt Adobe ADEPT-encrypted PDF files.
@@ -25,7 +28,7 @@ import re
import zlib import zlib
import struct import struct
import hashlib import hashlib
from itertools import chain from itertools import chain, islice
import xml.etree.ElementTree as etree import xml.etree.ElementTree as etree
import Tkinter import Tkinter
import Tkconstants import Tkconstants
@@ -163,16 +166,16 @@ def nunpack(s, default=0):
elif l == 1: elif l == 1:
return ord(s) return ord(s)
elif l == 2: elif l == 2:
return unpack('>H', s)[0] return struct.unpack('>H', s)[0]
elif l == 3: elif l == 3:
return unpack('>L', '\x00'+s)[0] return struct.unpack('>L', '\x00'+s)[0]
elif l == 4: elif l == 4:
return unpack('>L', s)[0] return struct.unpack('>L', s)[0]
else: else:
return TypeError('invalid length: %d' % l) return TypeError('invalid length: %d' % l)
STRICT = 0 STRICT = 1
## PS Exceptions ## PS Exceptions
@@ -680,6 +683,12 @@ class PSStackParser(PSBaseParser):
return obj return obj
# Interned literal for the 'Crypt' stream filter name; decode() compares a
# filter against it and rejects the stream as unsupported.
LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
# Each tuple below holds the two interned spellings (full name and short form)
# of one stream filter; decode() tests a filter with `in` against the tuple.
LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'), PSLiteralTable.intern('Fl'))
LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'), PSLiteralTable.intern('LZW'))
LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'), PSLiteralTable.intern('A85'))
## PDF Objects ## PDF Objects
## ##
class PDFObject(PSObject): pass class PDFObject(PSObject): pass
@@ -741,11 +750,11 @@ def decipher_all(decipher, objid, genno, x):
''' '''
if isinstance(x, str): if isinstance(x, str):
return decipher(objid, genno, x) return decipher(objid, genno, x)
decf = lambda v: decipher_all(decipher, objid, genno, v)
if isinstance(x, list): if isinstance(x, list):
x = [ decipher_all(decipher, objid, genno, v) for v in x ] x = [decf(v) for v in x]
elif isinstance(x, dict): elif isinstance(x, dict):
for (k,v) in x.iteritems(): x = dict((k, decf(v)) for (k, v) in x.iteritems())
x[k] = decipher_all(decipher, objid, genno, v)
return x return x
# Type checking # Type checking
@@ -805,6 +814,28 @@ def stream_value(x):
return PDFStream({}, '') return PDFStream({}, '')
return x return x
# ascii85decode(data)
def ascii85decode(data):
    """Decode ASCII85-encoded *data* and return the decoded byte string.

    Characters '!' through 'u' are base-85 digits; every five digits produce
    four output bytes.  A 'z' outside a group stands for four zero bytes.
    A '~' ends the data: a trailing partial group of n digits is padded with
    'u' digits and contributes n-1 bytes.  Any other character (whitespace,
    the '>' after '~', ...) is ignored.  A partial group that is not followed
    by '~' is dropped.

    data -- a byte string (``str`` on Python 2); text is encoded as latin-1.

    Raises ValueError if 'z' appears inside a group, and struct.error if a
    group encodes a value that does not fit in 32 bits.
    """
    if not isinstance(data, bytes):
        data = data.encode('latin-1')
    chunks = []  # joined once at the end instead of quadratic `out +=`
    n = b = 0
    # bytearray yields integer byte values on both Python 2 and Python 3.
    for c in bytearray(data):
        if 33 <= c <= 117:  # '!' .. 'u'
            n += 1
            b = b * 85 + (c - 33)
            if n == 5:
                chunks.append(struct.pack('>L', b))
                n = b = 0
        elif c == 122:  # 'z'
            if n != 0:
                raise ValueError("'z' inside an ASCII85 group")
            chunks.append(b'\0\0\0\0')
        elif c == 126:  # '~'
            if n:
                # Pad the short final group with the highest digit so that
                # truncating to n-1 bytes recovers the original data.
                for _ in range(5 - n):
                    b = b * 85 + 84
                chunks.append(struct.pack('>L', b)[:n - 1])
            break
    return b''.join(chunks)
## PDFStream type ## PDFStream type
## ##
@@ -834,12 +865,76 @@ class PDFStream(PDFObject):
return '<PDFStream(%r): raw=%d, %r>' % \ return '<PDFStream(%r): raw=%d, %r>' % \
(self.objid, len(self.rawdata), self.dic) (self.objid, len(self.rawdata), self.dic)
# Decode this stream: decipher self.rawdata if a decipher is set, apply the
# filters named in self.dic['Filter'], undo predictor 12 if requested, then
# store the result in self.data and clear self.rawdata.
# NOTE(review): indentation is lost in this rendering, so the nesting (in
# particular whether the predictor section runs inside the filter loop) must
# be confirmed against the real file.
def decode(self):
# Only valid before the first decode, while the raw bytes are still held.
assert self.data == None and self.rawdata != None
data = self.rawdata
if self.decipher:
# Handle encryption
data = self.decipher(self.objid, self.genno, data)
# No filter entry: the (deciphered) raw bytes are the final data.
if 'Filter' not in self.dic:
self.data = data
self.rawdata = None
return
filters = self.dic['Filter']
# A single filter is wrapped in a list so one loop handles both forms.
if not isinstance(filters, list):
filters = [ filters ]
for f in filters:
if f in LITERALS_FLATE_DECODE:
# will get errors if the document is encrypted.
data = zlib.decompress(data)
elif f in LITERALS_LZW_DECODE:
# Prefer cStringIO and fall back to StringIO when it is unavailable.
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
data = ''.join(LZWDecoder(StringIO(data)).run())
elif f in LITERALS_ASCII85_DECODE:
data = ascii85decode(data)
elif f == LITERAL_CRYPT:
raise PDFNotImplementedError('/Crypt filter is unsupported')
else:
raise PDFNotImplementedError('Unsupported filter: %r' % f)
# apply predictors
# Parameters are read from 'DP' when present, otherwise from 'DecodeParms'.
if 'DP' in self.dic:
params = self.dic['DP']
else:
params = self.dic.get('DecodeParms', {})
if 'Predictor' in params:
pred = int_value(params['Predictor'])
if pred:
# Only predictor 12 is implemented, and it needs the row width.
if pred != 12:
raise PDFNotImplementedError(
'Unsupported predictor: %r' % pred)
if 'Columns' not in params:
raise PDFValueError(
'Columns undefined for predictor=12')
columns = int_value(params['Columns'])
buf = ''
# ent0 is the previous output row; it starts as all zero bytes.
ent0 = '\x00' * columns
# Each input row is one tag byte followed by `columns` data bytes.
for i in xrange(0, len(data), columns+1):
# `pred` is reused here for the per-row tag byte.
pred = data[i]
ent1 = data[i+1:i+1+columns]
# Tag '\x02': add the previous row byte-wise, modulo 256; rows with any
# other tag are copied through unchanged.
if pred == '\x02':
ent1 = ''.join(chr((ord(a)+ord(b)) & 255) \
for (a,b) in zip(ent0,ent1))
buf += ent1
ent0 = ent1
data = buf
# Keep only the decoded form from here on.
self.data = data
self.rawdata = None
return
def get_data(self):
    """Return the decoded stream data, decoding it on first access.

    decode() is called only while self.data is still None; an empty but
    already decoded value is returned as-is without decoding again.
    """
    # Identity test: None is a singleton, and `== None` would invoke __eq__.
    if self.data is None:
        self.decode()
    return self.data
def get_rawdata(self): def get_rawdata(self):
return self.rawdata return self.rawdata
def get_decdata(self): def get_decdata(self):
data = self.rawdata data = self.rawdata
if self.decipher: if self.decipher and data:
# Handle encryption # Handle encryption
data = self.decipher(self.objid, self.genno, data) data = self.decipher(self.objid, self.genno, data)
return data return data
@@ -932,6 +1027,66 @@ class PDFXRef(object):
return (None, pos) return (None, pos)
## PDFXRefStream
##
class PDFXRefStream(object):
    """Cross-reference table loaded from a cross-reference stream (PDF 1.5).

    After load():
      index   -- list of (first_objid, count) subsections
      data    -- decoded stream bytes holding fixed-width entries
      fl1, fl2, fl3 -- byte widths of the three fields of an entry ('W')
      entlen  -- total byte width of one entry
      trailer -- the stream dictionary, used as the trailer
    """

    def __init__(self):
        self.index = None
        self.data = None
        self.entlen = None
        self.fl1 = self.fl2 = self.fl3 = None
        return

    def __repr__(self):
        # Report the subsections; this class has no objid_first/objid_last
        # attributes, so formatting those raised AttributeError.
        return '<PDFXRefStream: index=%r>' % (self.index,)

    def objids(self):
        """Yield every object id covered by the index subsections, in order."""
        for first, size in self.index:
            objid = first
            stop = first + size
            while objid < stop:
                yield objid
                objid += 1

    def load(self, parser, debug=0):
        """Read the cross-reference stream object at the parser's position.

        Raises PDFNoValidXRef when the object is not a stream whose 'Type'
        is the XRef literal (including when 'Type' is missing).
        """
        parser.nexttoken()  # object id, ignored
        parser.nexttoken()  # generation number, ignored
        parser.nexttoken()  # keyword token, ignored
        (_, stream) = parser.nextobject()
        # .get() so that a missing 'Type' is reported as an invalid xref
        # rather than escaping as a bare KeyError.
        if not isinstance(stream, PDFStream) or \
           stream.dic.get('Type') is not LITERAL_XREF:
            raise PDFNoValidXRef('Invalid PDF stream spec.')
        size = stream.dic['Size']
        # Without 'Index' the stream describes objects 0 .. size-1.
        index = stream.dic.get('Index', (0, size))
        # 'Index' is a flat sequence of first/count pairs.  Materialize the
        # pairs because objids() and getpos() iterate them repeatedly.
        self.index = list(zip(islice(index, 0, None, 2),
                              islice(index, 1, None, 2)))
        (self.fl1, self.fl2, self.fl3) = stream.dic['W']
        self.data = stream.get_data()
        self.entlen = self.fl1 + self.fl2 + self.fl3
        self.trailer = stream.dic
        return

    def getpos(self, objid):
        """Look up *objid* and return where the object is stored.

        Returns (None, pos) for an entry of type 1, or (stmid, index) for an
        entry of type 2, both read from the entry's second and third fields.
        Raises KeyError when objid is outside every subsection or its entry
        has any other type (a free object).
        """
        # offset counts the entries of all subsections before the match.
        offset = 0
        for first, size in self.index:
            if first <= objid < first + size:
                break
            offset += size
        else:
            raise KeyError(objid)
        i = self.entlen * ((objid - first) + offset)
        ent = self.data[i:i + self.entlen]
        # A zero-width type field defaults to type 1.
        f1 = nunpack(ent[:self.fl1], 1)
        if f1 == 1:
            pos = nunpack(ent[self.fl1:self.fl1 + self.fl2])
            return (None, pos)
        elif f1 == 2:
            stmid = nunpack(ent[self.fl1:self.fl1 + self.fl2])
            index = nunpack(ent[self.fl1 + self.fl2:])
            return (stmid, index)
        # this is a free object
        raise KeyError(objid)
## PDFDocument ## PDFDocument
## ##
## A PDFDocument object represents a PDF document. ## A PDFDocument object represents a PDF document.
@@ -1020,7 +1175,7 @@ class PDFDocument(object):
key = ASN1Parser([ord(x) for x in keyder]) key = ASN1Parser([ord(x) for x in keyder])
key = [bytesToNumber(key.getChild(x).value) for x in xrange(1, 4)] key = [bytesToNumber(key.getChild(x).value) for x in xrange(1, 4)]
rsa = RSA.construct(key) rsa = RSA.construct(key)
length = int_value(param.get('Length')) / 8 length = int_value(param.get('Length', 0)) / 8
rights = str_value(param.get('ADEPT_LICENSE')).decode('base64') rights = str_value(param.get('ADEPT_LICENSE')).decode('base64')
rights = zlib.decompress(rights, -15) rights = zlib.decompress(rights, -15)
rights = etree.fromstring(rights) rights = etree.fromstring(rights)
@@ -1031,11 +1186,16 @@ class PDFDocument(object):
raise ADEPTError('error decrypting book session key') raise ADEPTError('error decrypting book session key')
index = bookkey.index('\0') + 1 index = bookkey.index('\0') + 1
bookkey = bookkey[index:] bookkey = bookkey[index:]
V = 2 ebx_V = int_value(param.get('V', 4))
if (length and len(bookkey) == (length + 1)) or \ ebx_type = int_value(param.get('EBX_ENCRYPTIONTYPE', 6))
(not length and len(bookkey) & 1 == 1): # added because of the booktype / decryption book session key error
if ebx_V == 3:
V = 3
elif ebx_V < 4 or ebx_type < 6:
V = ord(bookkey[0]) V = ord(bookkey[0])
bookkey = bookkey[1:] bookkey = bookkey[1:]
else:
V = 2
if length and len(bookkey) != length: if length and len(bookkey) != length:
raise ADEPTError('error decrypting book session key') raise ADEPTError('error decrypting book session key')
self.decrypt_key = bookkey self.decrypt_key = bookkey
@@ -1131,46 +1291,17 @@ class PDFDocument(object):
else: else:
for xref in self.xrefs: for xref in self.xrefs:
try: try:
(strmid, index) = xref.getpos(objid) (stmid, index) = xref.getpos(objid)
break break
except KeyError: except KeyError:
pass pass
else: else:
if STRICT: return
raise PDFSyntaxError('Cannot locate objid=%r' % objid) #if STRICT:
# raise PDFSyntaxError('Cannot locate objid=%r' % objid)
return None return None
if strmid: if stmid:
stream = stream_value(self.getobj(strmid)) return PDFObjStmRef(objid, stmid, index)
if stream.dic.get('Type') is not LITERAL_OBJSTM:
if STRICT:
raise PDFSyntaxError('Not a stream object: %r' % stream)
try:
n = stream.dic['N']
except KeyError:
if STRICT:
raise PDFSyntaxError('N is not defined: %r' % stream)
n = 0
if strmid in self.parsed_objs:
objs = self.parsed_objs[strmid]
else:
parser = PDFObjStrmParser(self, stream.get_data())
objs = []
try:
while 1:
(_,obj) = parser.nextobject()
objs.append(obj)
except PSEOF:
pass
self.parsed_objs[strmid] = objs
genno = 0
i = n*2+index
try:
obj = objs[i]
except IndexError:
raise PDFSyntaxError(
'Invalid object number: objid=%r' % (objid))
if isinstance(obj, PDFStream):
obj.set_objid(objid, 0)
else: else:
self.parser.seek(index) self.parser.seek(index)
(_,objid1) = self.parser.nexttoken() # objid (_,objid1) = self.parser.nexttoken() # objid
@@ -1188,6 +1319,12 @@ class PDFDocument(object):
obj = decipher_all(self.decipher, objid, genno, obj) obj = decipher_all(self.decipher, objid, genno, obj)
return obj return obj
class PDFObjStmRef(object):
    """Marker for an object that is stored inside an object stream.

    objid -- id of the object itself
    stmid -- id of the stream that contains it
    index -- the value the cross-reference entry gives alongside stmid
    """

    def __init__(self, objid, stmid, index):
        self.objid = objid
        self.stmid = stmid
        self.index = index

    def __repr__(self):
        return '<PDFObjStmRef: objid=%r, stmid=%r, index=%r>' % \
               (self.objid, self.stmid, self.index)
## PDFParser ## PDFParser
## ##
@@ -1290,6 +1427,13 @@ class PDFParser(PSStackParser):
(pos, token) = self.nexttoken() (pos, token) = self.nexttoken()
except PSEOF: except PSEOF:
raise PDFNoValidXRef('Unexpected EOF') raise PDFNoValidXRef('Unexpected EOF')
if isinstance(token, int):
# XRefStream: PDF-1.5
self.seek(pos)
self.reset()
xref = PDFXRefStream()
xref.load(self)
else:
if token is not self.KEYWORD_XREF: if token is not self.KEYWORD_XREF:
raise PDFNoValidXRef('xref not found: pos=%d, token=%r' % raise PDFNoValidXRef('xref not found: pos=%d, token=%r' %
(pos, token)) (pos, token))
@@ -1298,6 +1442,9 @@ class PDFParser(PSStackParser):
xref.load(self) xref.load(self)
xrefs.append(xref) xrefs.append(xref)
trailer = xref.trailer trailer = xref.trailer
if 'XRefStm' in trailer:
pos = int_value(trailer['XRefStm'])
self.read_xref_from(pos, xrefs)
if 'Prev' in trailer: if 'Prev' in trailer:
# find previous xref # find previous xref
pos = int_value(trailer['Prev']) pos = int_value(trailer['Prev'])
@@ -1345,10 +1492,13 @@ class PDFSerializer(object):
parser = PDFParser(doc, inf) parser = PDFParser(doc, inf)
doc.initialize(keypath) doc.initialize(keypath)
self.objids = objids = set() self.objids = objids = set()
for xref in doc.xrefs: for xref in reversed(doc.xrefs):
trailer = xref.trailer trailer = xref.trailer
for objid in xref.objids(): for objid in xref.objids():
objids.add(objid) objids.add(objid)
trailer = dict(trailer)
trailer.pop('Prev', None)
trailer.pop('XRefStm', None)
if 'Encrypt' in trailer: if 'Encrypt' in trailer:
objids.remove(trailer.pop('Encrypt').objid) objids.remove(trailer.pop('Encrypt').objid)
self.trailer = trailer self.trailer = trailer
@@ -1360,26 +1510,64 @@ class PDFSerializer(object):
doc = self.doc doc = self.doc
objids = self.objids objids = self.objids
xrefs = {} xrefs = {}
xrefstm = {}
maxobj = max(objids) maxobj = max(objids)
trailer = dict(self.trailer)
trailer['Size'] = maxobj + 1
for objid in objids: for objid in objids:
obj = doc.getobj(objid)
if isinstance(obj, PDFObjStmRef):
xrefstm[objid] = obj
continue
xrefs[objid] = self.tell() xrefs[objid] = self.tell()
self.serialize_indirect(objid, doc.getobj(objid)) self.serialize_indirect(objid, obj)
startxref = self.tell() startxref = self.tell()
self.write('xref\n') self.write('xref\n')
self.write('0 %d\n' % (maxobj + 1,)) self.write('0 %d\n' % (maxobj + 1,))
for objid in xrange(0, maxobj + 1): for objid in xrange(0, maxobj + 1):
if objid in objids: if objid in xrefs:
self.write("%010d %05d n \n" % (xrefs[objid], 0)) self.write("%010d %05d n \n" % (xrefs[objid], 0))
else: else:
self.write("%010d %05d f \n" % (0, 65535)) self.write("%010d %05d f \n" % (0, 65535))
self.write('trailer\n') self.write('trailer\n')
self.serialize_object(self.trailer) self.serialize_object(trailer)
self.write('\nstartxref\n%d\n%%%%EOF' % startxref)
if not xrefstm:
return
index = []
first = None
prev = None
data = []
for objid in sorted(xrefstm):
if first is None:
first = objid
elif objid != prev + 1:
index.extend((first, prev - first + 1))
first = objid
prev = objid
stmid = xrefstm[objid].stmid
data.append(struct.pack('>BHB', 2, stmid, 0))
index.extend((first, prev - first + 1))
data = zlib.compress(''.join(data))
dic = {'Type': LITERAL_XREF, 'Size': prev + 1, 'Index': index,
'W': [1, 2, 1], 'Length': len(data), 'Prev': startxref,
'Filter': LITERALS_FLATE_DECODE[0],}
obj = PDFStream(dic, data)
self.write('\n')
trailer['XRefStm'] = startxrefstm = self.tell()
self.serialize_indirect(maxobj + 1, obj)
trailer['Prev'] = startxref
startxref = self.tell()
self.write('xref\n')
self.write('%d 1\n' % (maxobj + 1,))
self.write("%010d %05d n \n" % (startxrefstm, 0))
self.write('trailer\n')
self.serialize_object(trailer)
self.write('\nstartxref\n%d\n%%%%EOF' % startxref) self.write('\nstartxref\n%d\n%%%%EOF' % startxref)
def write(self, *data): def write(self, data):
for datum in data: self.outf.write(data)
self.outf.write(datum) self.last = data[-1:]
self.last = data[-1][-1:]
def tell(self): def tell(self):
return self.outf.tell() return self.outf.tell()
@@ -1389,6 +1577,9 @@ class PDFSerializer(object):
string = string.replace('\n', r'\n') string = string.replace('\n', r'\n')
string = string.replace('(', r'\(') string = string.replace('(', r'\(')
string = string.replace(')', r'\)') string = string.replace(')', r'\)')
# get rid of ciando id
regularexp = re.compile(r'http://www.ciando.com/index.cfm/intRefererID/\d{5}')
if regularexp.match(string): return ('http://www.ciando.com')
return string return string
def serialize_object(self, obj): def serialize_object(self, obj):
@@ -1566,5 +1757,6 @@ def gui_main():
if __name__ == '__main__': if __name__ == '__main__':
# sys.exit(cli_main()) if len(sys.argv) > 1:
sys.exit(cli_main())
sys.exit(gui_main()) sys.exit(gui_main())