tools v3.0

First combined mobi/topaz kindle tool
This commit is contained in:
Apprentice Alf
2010-12-30 22:41:07 +00:00
parent 38eabe7612
commit a7856f5c32
148 changed files with 13779 additions and 8871 deletions

View File

@@ -0,0 +1,260 @@
#!/usr/bin/env python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
import sys
sys.path.append('lib')
import os, os.path, urllib
import Tkinter
import Tkconstants
import tkFileDialog
import tkMessageBox
from scrolltextwidget import ScrolledText
import subprocess
from subprocess import Popen, PIPE, STDOUT
import subasyncio
from subasyncio import Process
class MainDialog(Tkinter.Frame):
def __init__(self, root):
Tkinter.Frame.__init__(self, root, border=5)
self.root = root
self.interval = 1000
self.p2 = None
self.status = Tkinter.Label(self, text='Remove Encryption from a Kindle/Mobi/Topaz eBook')
self.status.pack(fill=Tkconstants.X, expand=1)
body = Tkinter.Frame(self)
body.pack(fill=Tkconstants.X, expand=1)
sticky = Tkconstants.E + Tkconstants.W
body.grid_columnconfigure(1, weight=2)
Tkinter.Label(body, text='Kindle/Mobi/Topaz eBook input file').grid(row=0, sticky=Tkconstants.E)
self.mobipath = Tkinter.Entry(body, width=50)
self.mobipath.grid(row=0, column=1, sticky=sticky)
cwd = os.getcwdu()
cwd = cwd.encode('utf-8')
self.mobipath.insert(0, cwd)
button = Tkinter.Button(body, text="...", command=self.get_mobipath)
button.grid(row=0, column=2)
Tkinter.Label(body, text='Directory for the Unencrypted Output File(s)').grid(row=1, sticky=Tkconstants.E)
self.outpath = Tkinter.Entry(body, width=50)
self.outpath.grid(row=1, column=1, sticky=sticky)
cwd = os.getcwdu()
cwd = cwd.encode('utf-8')
outname = cwd
self.outpath.insert(0, outname)
button = Tkinter.Button(body, text="...", command=self.get_outpath)
button.grid(row=1, column=2)
Tkinter.Label(body, text='Optional Alternative Kindle.info file').grid(row=2, sticky=Tkconstants.E)
self.altinfopath = Tkinter.Entry(body, width=50)
self.altinfopath.grid(row=2, column=1, sticky=sticky)
#cwd = os.getcwdu()
#cwd = cwd.encode('utf-8')
#self.altinfopath.insert(0, cwd)
button = Tkinter.Button(body, text="...", command=self.get_altinfopath)
button.grid(row=2, column=2)
Tkinter.Label(body, text='Optional Comma Separated List of 10 Character PIDs (no spaces)').grid(row=3, sticky=Tkconstants.E)
self.pidnums = Tkinter.StringVar()
self.pidinfo = Tkinter.Entry(body, width=50, textvariable=self.pidnums)
self.pidinfo.grid(row=3, column=1, sticky=sticky)
Tkinter.Label(body, text='Optional Comma Separated List of 16 Character Kindle Serial Numbers (no spaces)').grid(row=4, sticky=Tkconstants.E)
self.sernums = Tkinter.StringVar()
self.serinfo = Tkinter.Entry(body, width=50, textvariable=self.sernums)
self.serinfo.grid(row=4, column=1, sticky=sticky)
msg1 = 'Conversion Log \n\n'
self.stext = ScrolledText(body, bd=5, relief=Tkconstants.RIDGE, height=15, width=60, wrap=Tkconstants.WORD)
self.stext.grid(row=6, column=0, columnspan=2,sticky=sticky)
self.stext.insert(Tkconstants.END,msg1)
buttons = Tkinter.Frame(self)
buttons.pack()
self.sbotton = Tkinter.Button(
buttons, text="Start", width=10, command=self.convertit)
self.sbotton.pack(side=Tkconstants.LEFT)
Tkinter.Frame(buttons, width=10).pack(side=Tkconstants.LEFT)
self.qbutton = Tkinter.Button(
buttons, text="Quit", width=10, command=self.quitting)
self.qbutton.pack(side=Tkconstants.RIGHT)
# read from subprocess pipe without blocking
# invoked every interval via the widget "after"
# option being used, so need to reset it for the next time
def processPipe(self):
poll = self.p2.wait('nowait')
if poll != None:
text = self.p2.readerr()
text += self.p2.read()
msg = text + '\n\n' + 'Encryption successfully removed\n'
if poll == 1:
msg = text + '\n\n' + 'Error: Encryption Removal Failed\n'
if poll == 2:
msg = text + '\n\n' + 'Input File was Not Encrypted - No Output File Needed\n'
self.showCmdOutput(msg)
self.p2 = None
self.sbotton.configure(state='normal')
return
text = self.p2.readerr()
text += self.p2.read()
self.showCmdOutput(text)
# make sure we get invoked again by event loop after interval
self.stext.after(self.interval,self.processPipe)
return
# post output from subprocess in scrolled text widget
def showCmdOutput(self, msg):
if msg and msg !='':
msg = msg.encode('utf-8')
self.stext.insert(Tkconstants.END,msg)
self.stext.yview_pickplace(Tkconstants.END)
return
# run as a subprocess via pipes and collect stdout
def mobirdr(self, infile, outfile, altinfopath, pidnums, sernums):
# os.putenv('PYTHONUNBUFFERED', '1')
tool = 'k4mobidedrm.py'
pidoption = ''
if pidnums and pidnums != '':
pidoption = ' -p "' + pidnums + '" '
seroption = ''
if sernums and sernums != '':
seroption = ' -s "' + sernums + '" '
infooption = ''
if altinfopath and altinfopath != '':
infooption = ' -k "' + altinfopath + '" '
cmdline = 'python ./lib/' + tool + ' ' + pidoption + seroption + infooption + '"' + infile + '" "' + outfile + '"'
print cmdline
if sys.platform.startswith('win'):
search_path = os.environ['PATH']
search_path = search_path.lower()
if search_path.find('python') >= 0:
cmdline = 'python lib\\' + tool + ' ' + pidoption + seroption + infooption + '"' + infile + '" "' + outfile + '"'
else :
cmdline = 'lib\\' + tool + ' ' + pidoption + seroption + infooption + '"' + infile + '" "' + outfile + '"'
cmdline = cmdline.encode(sys.getfilesystemencoding())
p2 = Process(cmdline, shell=True, bufsize=1, stdin=None, stdout=PIPE, stderr=PIPE, close_fds=False)
return p2
def get_mobipath(self):
cpath = self.mobipath.get()
mobipath = tkFileDialog.askopenfilename(
initialdir = cpath,
parent=None, title='Select Kindle/Mobi/Topaz eBook File',
defaultextension='.prc', filetypes=[('Mobi eBook File', '.prc'), ('Mobi eBook File', '.azw'),('Mobi eBook File', '.mobi'),('Mobi eBook File', '.tpz'),('Mobi eBook File', '.azw1'),('All Files', '.*')])
if mobipath:
mobipath = os.path.normpath(mobipath)
self.mobipath.delete(0, Tkconstants.END)
self.mobipath.insert(0, mobipath)
return
def get_outpath(self):
cwd = os.getcwdu()
cwd = cwd.encode('utf-8')
outpath = tkFileDialog.askdirectory(
parent=None, title='Directory to Store Unencrypted file(s) into',
initialdir=cwd, initialfile=None)
if outpath:
outpath = os.path.normpath(outpath)
self.outpath.delete(0, Tkconstants.END)
self.outpath.insert(0, outpath)
return
def get_altinfopath(self):
cwd = os.getcwdu()
cwd = cwd.encode('utf-8')
altinfopath = tkFileDialog.askopenfilename(
parent=None, title='Select Alternative kindle.info File',
defaultextension='.prc', filetypes=[('Kindle Info', '.info'),
('All Files', '.*')],
initialdir=cwd)
if altinfopath:
altinfopath = os.path.normpath(altinfopath)
self.altinfopath.delete(0, Tkconstants.END)
self.altinfopath.insert(0, altinfopath)
return
def quitting(self):
# kill any still running subprocess
if self.p2 != None:
if (self.p2.wait('nowait') == None):
self.p2.terminate()
self.root.destroy()
# actually ready to run the subprocess and get its output
def convertit(self):
self.status['text'] = ''
# now disable the button to prevent multiple launches
self.sbotton.configure(state='disabled')
mobipath = self.mobipath.get()
outpath = self.outpath.get()
altinfopath = self.altinfopath.get()
pidnums = self.pidinfo.get()
sernums = self.serinfo.get()
if not mobipath or not os.path.exists(mobipath) or not os.path.isfile(mobipath):
self.status['text'] = 'Specified Kindle Mobi eBook file does not exist'
self.sbotton.configure(state='normal')
return
tpz = False
# Identify any Topaz Files
with open(mobipath, 'rb') as f:
raw = f.read(3)
if raw.startswith('TPZ'):
tpz = True
f.close()
if not outpath:
self.status['text'] = 'No output directory specified'
self.sbotton.configure(state='normal')
return
if not os.path.isdir(outpath):
self.status['text'] = 'Error specified output directory does not exist'
self.sbotton.configure(state='normal')
return
if altinfopath and not os.path.exists(altinfopath):
self.status['text'] = 'Specified kindle.info file does not exist'
self.sbotton.configure(state='normal')
return
log = 'Command = "python k4mobidedrm.py"\n'
if not tpz:
log += 'Kindle/Mobi Path = "'+ mobipath + '"\n'
else:
log += 'Topaz Path = "'+ mobipath + '"\n'
log += 'Output Directory = "' + outpath + '"\n'
log += 'Kindle.info file = "' + altinfopath + '"\n'
log += 'PID list = "' + pidnums + '"\n'
log += 'Serial Number list = "' + sernums + '"\n'
log += '\n\n'
log += 'Please Wait ...\n\n'
log = log.encode('utf-8')
self.stext.insert(Tkconstants.END,log)
self.p2 = self.mobirdr(mobipath, outpath, altinfopath, pidnums, sernums)
# python does not seem to allow you to create
# your own eventloop which every other gui does - strange
# so need to use the widget "after" command to force
# event loop to run non-gui events every interval
self.stext.after(self.interval,self.processPipe)
return
def main(argv=None):
root = Tkinter.Tk()
root.title('Kindle/Mobi/Topaz eBook Encryption Removal')
root.resizable(True, False)
root.minsize(300, 0)
MainDialog(root).pack(fill=Tkconstants.X, expand=1)
root.mainloop()
return 0
if __name__ == "__main__":
sys.exit(main())

View File

@@ -0,0 +1,59 @@
KindleBooks (Originally called K4MobiDeDRM and Topaz_Tools)
This tools combines functionality of MobiDeDRM with that of K4PCDeDRM, K4MDeDRM, and K4DeDRM. Effectively, it provides one-stop shopping for all your Mobipocket, Kindle for iPhone/iPad/iPodTouch, Kindle for PC, and Kindle for Mac needs and should work for both Mobi and Topaz ebooks.
Preliminary Steps:
1. Make sure you have Python 2.X installed (32 bit) and properly set as part of your SYSTEM PATH environment variable (On Windows I recommend ActiveState's ActivePython. See their web pages for instructions on how to install and how to properly set your PATH). On Mac OSX 10.6 everything you need is already installed.
****
Please Note: If you a happy user of MobiDeDRM, K4DeDRM, K4PCDeDRM, or K4MUnswindle, please continue to use these programs as there is no additional capability provided by this tool over the others. In the long run, if you have problems with any of those tools, you might want to try this one as it will continue under development eventually replacing all of those tools.
****
Instructions:
1. double-click on KindleBooks.pyw
2. In the window that opens:
hit the first '...' button to locate your DRM Kindle-style ebook
3. Then hit the second '...' button to select an output directory for the unlocked file
4. If you have multiple Kindle.Info files and would like to use one specific one, please hit the third "...' button to select it. Note, if you only have one Kindle.Info file (like most users) this can and should be left blank.
5. Then add in any PIDs you need from KindleV1, Kindle for iPhone/iPad/iPodTouch, or other single PID devices to the provided box as a comma separated list of 10 digit PID numbers. If this is a Kindle for Mac or a Kindle for PC book then you can leave this box blank
6. If you have standalone Kindles, add in any 16 digit Serial Numbers as a comma separated list. If this is a Kindle for Mac or a Kindle for PC book then you can leave this box blank
7. hit the 'Start' button
After a short delay, you should see progress in the Conversion Log window indicating is the unlocking was a success or failure.
If your book was a normal Mobi style ebook:
If successful, you should see a "_nodrm" named version Mobi ebook.
If not please examine the Conversion Log window for any errors.
If your book was actually a Topaz book:
Please note that Topaz is most similar to a poor man's image only PDF in style. It has glyphs and x,y positions, ocrText used just for searching, that describe the image each page all encoded into a binary xml-like set of files.
If successful, you will have 3 zip archives created.
1. The first is BOOKNAME_nodrm.zip.
You can import this into calibre as is or unzip it and edit the book.html file you find inside. To create the book.html, Amazon's ocrText is combined with other information to recreate as closely as possible what the original book looked like. Unfortunately most bolding, italics is lost. Also, Amazon's ocrText can be absolutely horrible at times. Much work will be needed to clean up and correct Topaz books.
2. The second is BOOKNAME_SVG.zip
You can also import this into calibre or unzip it and open the indexsvg.xhtml file in any good Browser (Safari, Firefox, etc). This zip contains a set of svg images (one for each pages is created) and it shows the page exactly how it appeared. This zip can be used to create an image only pdf file via post conversion.
3. The third is BOOKNAME_XML.zip
This is a zip archive of the decrypted and translated xml-like descriptions of each page and can be archived/saved in case later code can do a better job converting these files. These are exactly what a Topaz books guts are. You should take a look at them in any text editor to see what they look like.
If the Topaz book conversion is not successful, a large _DEBUG.zip archive of all of the pieces is created and this can examined along with the Conversion Log window contents to determine the cause of the error and hopefully get it fixed in the next release.

View File

@@ -0,0 +1,900 @@
#! /usr/bin/python
"""
Comprehensive Mazama Book DRM with Topaz Cryptography V2.2
-----BEGIN PUBLIC KEY-----
MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDdBHJ4CNc6DNFCw4MRCw4SWAK6
M8hYfnNEI0yQmn5Ti+W8biT7EatpauE/5jgQMPBmdNrDr1hbHyHBSP7xeC2qlRWC
B62UCxeu/fpfnvNHDN/wPWWH4jynZ2M6cdcnE5LQ+FfeKqZn7gnG2No1U9h7oOHx
y2/pHuYme7U1TsgSjwIDAQAB
-----END PUBLIC KEY-----
"""
from __future__ import with_statement
import csv
import sys
import os
import getopt
import zlib
from struct import pack
from struct import unpack
from ctypes import windll, c_char_p, c_wchar_p, c_uint, POINTER, byref, \
create_unicode_buffer, create_string_buffer, CFUNCTYPE, addressof, \
string_at, Structure, c_void_p, cast
import _winreg as winreg
import Tkinter
import Tkconstants
import tkMessageBox
import traceback
import hashlib
MAX_PATH = 255
kernel32 = windll.kernel32
advapi32 = windll.advapi32
crypt32 = windll.crypt32
global kindleDatabase
global bookFile
global bookPayloadOffset
global bookHeaderRecords
global bookMetadata
global bookKey
global command
#
# Various character maps used to decrypt books. Probably supposed to act as obfuscation
#
charMap1 = "n5Pr6St7Uv8Wx9YzAb0Cd1Ef2Gh3Jk4M"
charMap2 = "AaZzB0bYyCc1XxDdW2wEeVv3FfUuG4g-TtHh5SsIiR6rJjQq7KkPpL8lOoMm9Nn_"
charMap3 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
charMap4 = "ABCDEFGHIJKLMNPQRSTUVWXYZ123456789"
#
# Exceptions for all the problems that might happen during the script
#
class CMBDTCError(Exception):
pass
class CMBDTCFatal(Exception):
pass
#
# Stolen stuff
#
class DataBlob(Structure):
_fields_ = [('cbData', c_uint),
('pbData', c_void_p)]
DataBlob_p = POINTER(DataBlob)
def GetSystemDirectory():
GetSystemDirectoryW = kernel32.GetSystemDirectoryW
GetSystemDirectoryW.argtypes = [c_wchar_p, c_uint]
GetSystemDirectoryW.restype = c_uint
def GetSystemDirectory():
buffer = create_unicode_buffer(MAX_PATH + 1)
GetSystemDirectoryW(buffer, len(buffer))
return buffer.value
return GetSystemDirectory
GetSystemDirectory = GetSystemDirectory()
def GetVolumeSerialNumber():
GetVolumeInformationW = kernel32.GetVolumeInformationW
GetVolumeInformationW.argtypes = [c_wchar_p, c_wchar_p, c_uint,
POINTER(c_uint), POINTER(c_uint),
POINTER(c_uint), c_wchar_p, c_uint]
GetVolumeInformationW.restype = c_uint
def GetVolumeSerialNumber(path):
vsn = c_uint(0)
GetVolumeInformationW(path, None, 0, byref(vsn), None, None, None, 0)
return vsn.value
return GetVolumeSerialNumber
GetVolumeSerialNumber = GetVolumeSerialNumber()
def GetUserName():
GetUserNameW = advapi32.GetUserNameW
GetUserNameW.argtypes = [c_wchar_p, POINTER(c_uint)]
GetUserNameW.restype = c_uint
def GetUserName():
buffer = create_unicode_buffer(32)
size = c_uint(len(buffer))
while not GetUserNameW(buffer, byref(size)):
buffer = create_unicode_buffer(len(buffer) * 2)
size.value = len(buffer)
return buffer.value.encode('utf-16-le')[::2]
return GetUserName
GetUserName = GetUserName()
def CryptUnprotectData():
_CryptUnprotectData = crypt32.CryptUnprotectData
_CryptUnprotectData.argtypes = [DataBlob_p, c_wchar_p, DataBlob_p,
c_void_p, c_void_p, c_uint, DataBlob_p]
_CryptUnprotectData.restype = c_uint
def CryptUnprotectData(indata, entropy):
indatab = create_string_buffer(indata)
indata = DataBlob(len(indata), cast(indatab, c_void_p))
entropyb = create_string_buffer(entropy)
entropy = DataBlob(len(entropy), cast(entropyb, c_void_p))
outdata = DataBlob()
if not _CryptUnprotectData(byref(indata), None, byref(entropy),
None, None, 0, byref(outdata)):
raise CMBDTCFatal("Failed to Unprotect Data")
return string_at(outdata.pbData, outdata.cbData)
return CryptUnprotectData
CryptUnprotectData = CryptUnprotectData()
#
# Returns the MD5 digest of "message"
#
def MD5(message):
ctx = hashlib.md5()
ctx.update(message)
return ctx.digest()
#
# Returns the MD5 digest of "message"
#
def SHA1(message):
ctx = hashlib.sha1()
ctx.update(message)
return ctx.digest()
#
# Open the book file at path
#
def openBook(path):
try:
return open(path,'rb')
except:
raise CMBDTCFatal("Could not open book file: " + path)
#
# Encode the bytes in data with the characters in map
#
def encode(data, map):
result = ""
for char in data:
value = ord(char)
Q = (value ^ 0x80) // len(map)
R = value % len(map)
result += map[Q]
result += map[R]
return result
#
# Hash the bytes in data and then encode the digest with the characters in map
#
def encodeHash(data,map):
return encode(MD5(data),map)
#
# Decode the string in data with the characters in map. Returns the decoded bytes
#
def decode(data,map):
result = ""
for i in range (0,len(data),2):
high = map.find(data[i])
low = map.find(data[i+1])
value = (((high * 0x40) ^ 0x80) & 0xFF) + low
result += pack("B",value)
return result
#
# Locate and open the Kindle.info file (Hopefully in the way it is done in the Kindle application)
#
def openKindleInfo():
regkey = winreg.OpenKey(winreg.HKEY_CURRENT_USER, "Software\\Microsoft\\Windows\\CurrentVersion\\Explorer\\Shell Folders\\")
path = winreg.QueryValueEx(regkey, 'Local AppData')[0]
return open(path+'\\Amazon\\Kindle For PC\\{AMAwzsaPaaZAzmZzZQzgZCAkZ3AjA_AY}\\kindle.info','r')
#
# Parse the Kindle.info file and return the records as a list of key-values
#
def parseKindleInfo():
DB = {}
infoReader = openKindleInfo()
infoReader.read(1)
data = infoReader.read()
items = data.split('{')
for item in items:
splito = item.split(':')
DB[splito[0]] =splito[1]
return DB
#
# Find if the original string for a hashed/encoded string is known. If so return the original string othwise return an empty string. (Totally not optimal)
#
def findNameForHash(hash):
names = ["kindle.account.tokens","kindle.cookie.item","eulaVersionAccepted","login_date","kindle.token.item","login","kindle.key.item","kindle.name.info","kindle.device.info", "MazamaRandomNumber"]
result = ""
for name in names:
if hash == encodeHash(name, charMap2):
result = name
break
return name
#
# Print all the records from the kindle.info file (option -i)
#
def printKindleInfo():
for record in kindleDatabase:
name = findNameForHash(record)
if name != "" :
print (name)
print ("--------------------------\n")
else :
print ("Unknown Record")
print getKindleInfoValueForHash(record)
print "\n"
#
# Get a record from the Kindle.info file for the key "hashedKey" (already hashed and encoded). Return the decoded and decrypted record
#
def getKindleInfoValueForHash(hashedKey):
global kindleDatabase
encryptedValue = decode(kindleDatabase[hashedKey],charMap2)
return CryptUnprotectData(encryptedValue,"")
#
# Get a record from the Kindle.info file for the string in "key" (plaintext). Return the decoded and decrypted record
#
def getKindleInfoValueForKey(key):
return getKindleInfoValueForHash(encodeHash(key,charMap2))
#
# Get a 7 bit encoded number from the book file
#
def bookReadEncodedNumber():
flag = False
data = ord(bookFile.read(1))
if data == 0xFF:
flag = True
data = ord(bookFile.read(1))
if data >= 0x80:
datax = (data & 0x7F)
while data >= 0x80 :
data = ord(bookFile.read(1))
datax = (datax <<7) + (data & 0x7F)
data = datax
if flag:
data = -data
return data
#
# Encode a number in 7 bit format
#
def encodeNumber(number):
result = ""
negative = False
flag = 0
if number < 0 :
number = -number + 1
negative = True
while True:
byte = number & 0x7F
number = number >> 7
byte += flag
result += chr(byte)
flag = 0x80
if number == 0 :
if (byte == 0xFF and negative == False) :
result += chr(0x80)
break
if negative:
result += chr(0xFF)
return result[::-1]
#
# Get a length prefixed string from the file
#
def bookReadString():
stringLength = bookReadEncodedNumber()
return unpack(str(stringLength)+"s",bookFile.read(stringLength))[0]
#
# Returns a length prefixed string
#
def lengthPrefixString(data):
return encodeNumber(len(data))+data
#
# Read and return the data of one header record at the current book file position [[offset,compressedLength,decompressedLength],...]
#
def bookReadHeaderRecordData():
nbValues = bookReadEncodedNumber()
values = []
for i in range (0,nbValues):
values.append([bookReadEncodedNumber(),bookReadEncodedNumber(),bookReadEncodedNumber()])
return values
#
# Read and parse one header record at the current book file position and return the associated data [[offset,compressedLength,decompressedLength],...]
#
def parseTopazHeaderRecord():
if ord(bookFile.read(1)) != 0x63:
raise CMBDTCFatal("Parse Error : Invalid Header")
tag = bookReadString()
record = bookReadHeaderRecordData()
return [tag,record]
#
# Parse the header of a Topaz file, get all the header records and the offset for the payload
#
def parseTopazHeader():
global bookHeaderRecords
global bookPayloadOffset
magic = unpack("4s",bookFile.read(4))[0]
if magic != 'TPZ0':
raise CMBDTCFatal("Parse Error : Invalid Header, not a Topaz file")
nbRecords = bookReadEncodedNumber()
bookHeaderRecords = {}
for i in range (0,nbRecords):
result = parseTopazHeaderRecord()
bookHeaderRecords[result[0]] = result[1]
if ord(bookFile.read(1)) != 0x64 :
raise CMBDTCFatal("Parse Error : Invalid Header")
bookPayloadOffset = bookFile.tell()
#
# Get a record in the book payload, given its name and index. If necessary the record is decrypted. The record is not decompressed
#
def getBookPayloadRecord(name, index):
encrypted = False
try:
recordOffset = bookHeaderRecords[name][index][0]
except:
raise CMBDTCFatal("Parse Error : Invalid Record, record not found")
bookFile.seek(bookPayloadOffset + recordOffset)
tag = bookReadString()
if tag != name :
raise CMBDTCFatal("Parse Error : Invalid Record, record name doesn't match")
recordIndex = bookReadEncodedNumber()
if recordIndex < 0 :
encrypted = True
recordIndex = -recordIndex -1
if recordIndex != index :
raise CMBDTCFatal("Parse Error : Invalid Record, index doesn't match")
if bookHeaderRecords[name][index][2] != 0 :
record = bookFile.read(bookHeaderRecords[name][index][2])
else:
record = bookFile.read(bookHeaderRecords[name][index][1])
if encrypted:
ctx = topazCryptoInit(bookKey)
record = topazCryptoDecrypt(record,ctx)
return record
#
# Extract, decrypt and decompress a book record indicated by name and index and print it or save it in "filename"
#
def extractBookPayloadRecord(name, index, filename):
compressed = False
try:
compressed = bookHeaderRecords[name][index][2] != 0
record = getBookPayloadRecord(name,index)
except:
print("Could not find record")
if compressed:
try:
record = zlib.decompress(record)
except:
raise CMBDTCFatal("Could not decompress record")
if filename != "":
try:
file = open(filename,"wb")
file.write(record)
file.close()
except:
raise CMBDTCFatal("Could not write to destination file")
else:
print(record)
#
# return next record [key,value] from the book metadata from the current book position
#
def readMetadataRecord():
return [bookReadString(),bookReadString()]
#
# Parse the metadata record from the book payload and return a list of [key,values]
#
def parseMetadata():
global bookHeaderRecords
global bookPayloadAddress
global bookMetadata
bookMetadata = {}
bookFile.seek(bookPayloadOffset + bookHeaderRecords["metadata"][0][0])
tag = bookReadString()
if tag != "metadata" :
raise CMBDTCFatal("Parse Error : Record Names Don't Match")
flags = ord(bookFile.read(1))
nbRecords = ord(bookFile.read(1))
for i in range (0,nbRecords) :
record =readMetadataRecord()
bookMetadata[record[0]] = record[1]
#
# Returns two bit at offset from a bit field
#
def getTwoBitsFromBitField(bitField,offset):
byteNumber = offset // 4
bitPosition = 6 - 2*(offset % 4)
return ord(bitField[byteNumber]) >> bitPosition & 3
#
# Returns the six bits at offset from a bit field
#
def getSixBitsFromBitField(bitField,offset):
offset *= 3
value = (getTwoBitsFromBitField(bitField,offset) <<4) + (getTwoBitsFromBitField(bitField,offset+1) << 2) +getTwoBitsFromBitField(bitField,offset+2)
return value
#
# 8 bits to six bits encoding from hash to generate PID string
#
def encodePID(hash):
global charMap3
PID = ""
for position in range (0,8):
PID += charMap3[getSixBitsFromBitField(hash,position)]
return PID
#
# Context initialisation for the Topaz Crypto
#
def topazCryptoInit(key):
ctx1 = 0x0CAFFE19E
for keyChar in key:
keyByte = ord(keyChar)
ctx2 = ctx1
ctx1 = ((((ctx1 >>2) * (ctx1 >>7))&0xFFFFFFFF) ^ (keyByte * keyByte * 0x0F902007)& 0xFFFFFFFF )
return [ctx1,ctx2]
#
# decrypt data with the context prepared by topazCryptoInit()
#
def topazCryptoDecrypt(data, ctx):
ctx1 = ctx[0]
ctx2 = ctx[1]
plainText = ""
for dataChar in data:
dataByte = ord(dataChar)
m = (dataByte ^ ((ctx1 >> 3) &0xFF) ^ ((ctx2<<3) & 0xFF)) &0xFF
ctx2 = ctx1
ctx1 = (((ctx1 >> 2) * (ctx1 >> 7)) &0xFFFFFFFF) ^((m * m * 0x0F902007) &0xFFFFFFFF)
plainText += chr(m)
return plainText
#
# Decrypt a payload record with the PID
#
def decryptRecord(data,PID):
ctx = topazCryptoInit(PID)
return topazCryptoDecrypt(data, ctx)
#
# Try to decrypt a dkey record (contains the book PID)
#
def decryptDkeyRecord(data,PID):
record = decryptRecord(data,PID)
fields = unpack("3sB8sB8s3s",record)
if fields[0] != "PID" or fields[5] != "pid" :
raise CMBDTCError("Didn't find PID magic numbers in record")
elif fields[1] != 8 or fields[3] != 8 :
raise CMBDTCError("Record didn't contain correct length fields")
elif fields[2] != PID :
raise CMBDTCError("Record didn't contain PID")
return fields[4]
#
# Decrypt all the book's dkey records (contain the book PID)
#
def decryptDkeyRecords(data,PID):
nbKeyRecords = ord(data[0])
records = []
data = data[1:]
for i in range (0,nbKeyRecords):
length = ord(data[0])
try:
key = decryptDkeyRecord(data[1:length+1],PID)
records.append(key)
except CMBDTCError:
pass
data = data[1+length:]
return records
#
# Encryption table used to generate the device PID
#
def generatePidEncryptionTable() :
table = []
for counter1 in range (0,0x100):
value = counter1
for counter2 in range (0,8):
if (value & 1 == 0) :
value = value >> 1
else :
value = value >> 1
value = value ^ 0xEDB88320
table.append(value)
return table
#
# Seed value used to generate the device PID
#
def generatePidSeed(table,dsn) :
value = 0
for counter in range (0,4) :
index = (ord(dsn[counter]) ^ value) &0xFF
value = (value >> 8) ^ table[index]
return value
#
# Generate the device PID
#
def generateDevicePID(table,dsn,nbRoll):
seed = generatePidSeed(table,dsn)
pidAscii = ""
pid = [(seed >>24) &0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF,(seed>>24) & 0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF]
index = 0
for counter in range (0,nbRoll):
pid[index] = pid[index] ^ ord(dsn[counter])
index = (index+1) %8
for counter in range (0,8):
index = ((((pid[counter] >>5) & 3) ^ pid[counter]) & 0x1f) + (pid[counter] >> 7)
pidAscii += charMap4[index]
return pidAscii
#
# Create decrypted book payload
#
def createDecryptedPayload(payload):
# store data to be able to create the header later
headerData= []
currentOffset = 0
# Add social DRM to decrypted files
try:
data = getKindleInfoValueForKey("kindle.name.info")+":"+ getKindleInfoValueForKey("login")
if payload!= None:
payload.write(lengthPrefixString("sdrm"))
payload.write(encodeNumber(0))
payload.write(data)
else:
currentOffset += len(lengthPrefixString("sdrm"))
currentOffset += len(encodeNumber(0))
currentOffset += len(data)
except:
pass
for headerRecord in bookHeaderRecords:
name = headerRecord
newRecord = []
if name != "dkey" :
for index in range (0,len(bookHeaderRecords[name])) :
offset = currentOffset
if payload != None:
# write tag
payload.write(lengthPrefixString(name))
# write data
payload.write(encodeNumber(index))
payload.write(getBookPayloadRecord(name, index))
else :
currentOffset += len(lengthPrefixString(name))
currentOffset += len(encodeNumber(index))
currentOffset += len(getBookPayloadRecord(name, index))
newRecord.append([offset,bookHeaderRecords[name][index][1],bookHeaderRecords[name][index][2]])
headerData.append([name,newRecord])
return headerData
#
# Create decrypted book
#
def createDecryptedBook(outputFile):
outputFile = open(outputFile,"wb")
# Write the payload in a temporary file
headerData = createDecryptedPayload(None)
outputFile.write("TPZ0")
outputFile.write(encodeNumber(len(headerData)))
for header in headerData :
outputFile.write(chr(0x63))
outputFile.write(lengthPrefixString(header[0]))
outputFile.write(encodeNumber(len(header[1])))
for numbers in header[1] :
outputFile.write(encodeNumber(numbers[0]))
outputFile.write(encodeNumber(numbers[1]))
outputFile.write(encodeNumber(numbers[2]))
outputFile.write(chr(0x64))
createDecryptedPayload(outputFile)
outputFile.close()
#
# Set the command to execute by the programm according to cmdLine parameters
#
def setCommand(name) :
global command
if command != "" :
raise CMBDTCFatal("Invalid command line parameters")
else :
command = name
#
# Program usage
#
def usage():
print("\nUsage:")
print("\nCMBDTC.py [options] bookFileName\n")
print("-p Adds a PID to the list of PIDs that are tried to decrypt the book key (can be used several times)")
print("-d Saves a decrypted copy of the book")
print("-r Prints or writes to disk a record indicated in the form name:index (e.g \"img:0\")")
print("-o Output file name to write records and decrypted books")
print("-v Verbose (can be used several times)")
print("-i Prints kindle.info database")
#
# Main
#
def main(argv=sys.argv):
global kindleDatabase
global bookMetadata
global bookKey
global bookFile
global command
progname = os.path.basename(argv[0])
verbose = 0
recordName = ""
recordIndex = 0
outputFile = ""
PIDs = []
kindleDatabase = None
command = ""
try:
opts, args = getopt.getopt(sys.argv[1:], "vdir:o:p:")
except getopt.GetoptError, err:
# print help information and exit:
print str(err) # will print something like "option -a not recognized"
usage()
sys.exit(2)
if len(opts) == 0 and len(args) == 0 :
usage()
sys.exit(2)
for o, a in opts:
if o == "-v":
verbose+=1
if o == "-i":
setCommand("printInfo")
if o =="-o":
if a == None :
raise CMBDTCFatal("Invalid parameter for -o")
outputFile = a
if o =="-r":
setCommand("printRecord")
try:
recordName,recordIndex = a.split(':')
except:
raise CMBDTCFatal("Invalid parameter for -r")
if o =="-p":
PIDs.append(a)
if o =="-d":
setCommand("doit")
if command == "" :
raise CMBDTCFatal("No action supplied on command line")
#
# Read the encrypted database
#
try:
kindleDatabase = parseKindleInfo()
except Exception, message:
if verbose>0:
print(message)
if kindleDatabase != None :
if command == "printInfo" :
printKindleInfo()
#
# Compute the DSN
#
# Get the Mazama Random number
MazamaRandomNumber = getKindleInfoValueForKey("MazamaRandomNumber")
# Get the HDD serial
encodedSystemVolumeSerialNumber = encodeHash(str(GetVolumeSerialNumber(GetSystemDirectory().split('\\')[0] + '\\')),charMap1)
# Get the current user name
encodedUsername = encodeHash(GetUserName(),charMap1)
# concat, hash and encode
DSN = encode(SHA1(MazamaRandomNumber+encodedSystemVolumeSerialNumber+encodedUsername),charMap1)
if verbose >1:
print("DSN: " + DSN)
#
# Compute the device PID
#
table = generatePidEncryptionTable()
devicePID = generateDevicePID(table,DSN,4)
PIDs.append(devicePID)
if verbose > 0:
print("Device PID: " + devicePID)
#
# Open book and parse metadata
#
if len(args) == 1:
bookFile = openBook(args[0])
parseTopazHeader()
parseMetadata()
#
# Compute book PID
#
# Get the account token
if kindleDatabase != None:
kindleAccountToken = getKindleInfoValueForKey("kindle.account.tokens")
if verbose >1:
print("Account Token: " + kindleAccountToken)
keysRecord = bookMetadata["keys"]
keysRecordRecord = bookMetadata[keysRecord]
pidHash = SHA1(DSN+kindleAccountToken+keysRecord+keysRecordRecord)
bookPID = encodePID(pidHash)
PIDs.append(bookPID)
if verbose > 0:
print ("Book PID: " + bookPID )
#
# Decrypt book key
#
dkey = getBookPayloadRecord('dkey', 0)
bookKeys = []
for PID in PIDs :
bookKeys+=decryptDkeyRecords(dkey,PID)
if len(bookKeys) == 0 :
if verbose > 0 :
print ("Book key could not be found. Maybe this book is not registered with this device.")
else :
bookKey = bookKeys[0]
if verbose > 0:
print("Book key: " + bookKey.encode('hex'))
if command == "printRecord" :
extractBookPayloadRecord(recordName,int(recordIndex),outputFile)
if outputFile != "" and verbose>0 :
print("Wrote record to file: "+outputFile)
elif command == "doit" :
if outputFile!="" :
createDecryptedBook(outputFile)
if verbose >0 :
print ("Decrypted book saved. Don't pirate!")
elif verbose > 0:
print("Output file name was not supplied.")
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@@ -0,0 +1,817 @@
#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# For use with Topaz Scripts Version 2.6
class Unbuffered:
def __init__(self, stream):
self.stream = stream
def write(self, data):
self.stream.write(data)
self.stream.flush()
def __getattr__(self, attr):
return getattr(self.stream, attr)
import sys
sys.stdout=Unbuffered(sys.stdout)
import csv
import os
import getopt
from struct import pack
from struct import unpack
# Get a 7 bit encoded number from string. The most
# significant byte comes first and has the high bit (8th) set
def readEncodedNumber(file):
flag = False
c = file.read(1)
if (len(c) == 0):
return None
data = ord(c)
if data == 0xFF:
flag = True
c = file.read(1)
if (len(c) == 0):
return None
data = ord(c)
if data >= 0x80:
datax = (data & 0x7F)
while data >= 0x80 :
c = file.read(1)
if (len(c) == 0):
return None
data = ord(c)
datax = (datax <<7) + (data & 0x7F)
data = datax
if flag:
data = -data
return data
# returns a binary string that encodes a number into 7 bits
# most significant byte first which has the high bit set
def encodeNumber(number):
result = ""
negative = False
flag = 0
if number < 0 :
number = -number + 1
negative = True
while True:
byte = number & 0x7F
number = number >> 7
byte += flag
result += chr(byte)
flag = 0x80
if number == 0 :
if (byte == 0xFF and negative == False) :
result += chr(0x80)
break
if negative:
result += chr(0xFF)
return result[::-1]
# create / read a length prefixed string from the file
def lengthPrefixString(data):
return encodeNumber(len(data))+data
def readString(file):
stringLength = readEncodedNumber(file)
if (stringLength == None):
return ""
sv = file.read(stringLength)
if (len(sv) != stringLength):
return ""
return unpack(str(stringLength)+"s",sv)[0]
# convert a binary string generated by encodeNumber (7 bit encoded number)
# to the value you would find inside the page*.dat files to be processed
def convert(i):
result = ''
val = encodeNumber(i)
for j in xrange(len(val)):
c = ord(val[j:j+1])
result += '%02x' % c
return result
# the complete string table used to store all book text content
# as well as the xml tokens and values that make sense out of it
class Dictionary(object):
def __init__(self, dictFile):
self.filename = dictFile
self.size = 0
self.fo = file(dictFile,'rb')
self.stable = []
self.size = readEncodedNumber(self.fo)
for i in xrange(self.size):
self.stable.append(self.escapestr(readString(self.fo)))
self.pos = 0
def escapestr(self, str):
str = str.replace('&','&amp;')
str = str.replace('<','&lt;')
str = str.replace('>','&gt;')
str = str.replace('=','&#61;')
return str
def lookup(self,val):
if ((val >= 0) and (val < self.size)) :
self.pos = val
return self.stable[self.pos]
else:
print "Error - %d outside of string table limits" % val
sys.exit(-1)
def getSize(self):
return self.size
def getPos(self):
return self.pos
def dumpDict(self):
for i in xrange(self.size):
print "%d %s %s" % (i, convert(i), self.stable[i])
return
# parses the xml snippets that are represented by each page*.dat file.
# also parses the other0.dat file - the main stylesheet
# and information used to inject the xml snippets into page*.dat files
class PageParser(object):
def __init__(self, filename, dict, debug, flat_xml):
self.fo = file(filename,'rb')
self.id = os.path.basename(filename).replace('.dat','')
self.dict = dict
self.debug = debug
self.flat_xml = flat_xml
self.tagpath = []
self.doc = []
self.snippetList = []
# hash table used to enable the decoding process
# This has all been developed by trial and error so it may still have omissions or
# contain errors
# Format:
# tag : (number of arguments, argument type, subtags present, special case of subtags presents when escaped)
token_tags = {
'x' : (1, 'scalar_number', 0, 0),
'y' : (1, 'scalar_number', 0, 0),
'h' : (1, 'scalar_number', 0, 0),
'w' : (1, 'scalar_number', 0, 0),
'firstWord' : (1, 'scalar_number', 0, 0),
'lastWord' : (1, 'scalar_number', 0, 0),
'rootID' : (1, 'scalar_number', 0, 0),
'stemID' : (1, 'scalar_number', 0, 0),
'type' : (1, 'scalar_text', 0, 0),
'info' : (0, 'number', 1, 0),
'info.word' : (0, 'number', 1, 1),
'info.word.ocrText' : (1, 'text', 0, 0),
'info.word.firstGlyph' : (1, 'raw', 0, 0),
'info.word.lastGlyph' : (1, 'raw', 0, 0),
'info.word.bl' : (1, 'raw', 0, 0),
'info.word.link_id' : (1, 'number', 0, 0),
'glyph' : (0, 'number', 1, 1),
'glyph.x' : (1, 'number', 0, 0),
'glyph.y' : (1, 'number', 0, 0),
'glyph.glyphID' : (1, 'number', 0, 0),
'dehyphen' : (0, 'number', 1, 1),
'dehyphen.rootID' : (1, 'number', 0, 0),
'dehyphen.stemID' : (1, 'number', 0, 0),
'dehyphen.stemPage' : (1, 'number', 0, 0),
'dehyphen.sh' : (1, 'number', 0, 0),
'links' : (0, 'number', 1, 1),
'links.page' : (1, 'number', 0, 0),
'links.rel' : (1, 'number', 0, 0),
'links.row' : (1, 'number', 0, 0),
'links.title' : (1, 'text', 0, 0),
'links.href' : (1, 'text', 0, 0),
'links.type' : (1, 'text', 0, 0),
'paraCont' : (0, 'number', 1, 1),
'paraCont.rootID' : (1, 'number', 0, 0),
'paraCont.stemID' : (1, 'number', 0, 0),
'paraCont.stemPage' : (1, 'number', 0, 0),
'paraStems' : (0, 'number', 1, 1),
'paraStems.stemID' : (1, 'number', 0, 0),
'wordStems' : (0, 'number', 1, 1),
'wordStems.stemID' : (1, 'number', 0, 0),
'empty' : (1, 'snippets', 1, 0),
'page' : (1, 'snippets', 1, 0),
'page.pageid' : (1, 'scalar_text', 0, 0),
'page.pagelabel' : (1, 'scalar_text', 0, 0),
'page.type' : (1, 'scalar_text', 0, 0),
'page.h' : (1, 'scalar_number', 0, 0),
'page.w' : (1, 'scalar_number', 0, 0),
'page.startID' : (1, 'scalar_number', 0, 0),
'group' : (1, 'snippets', 1, 0),
'group.type' : (1, 'scalar_text', 0, 0),
'region' : (1, 'snippets', 1, 0),
'region.type' : (1, 'scalar_text', 0, 0),
'region.x' : (1, 'scalar_number', 0, 0),
'region.y' : (1, 'scalar_number', 0, 0),
'region.h' : (1, 'scalar_number', 0, 0),
'region.w' : (1, 'scalar_number', 0, 0),
'empty_text_region' : (1, 'snippets', 1, 0),
'img' : (1, 'snippets', 1, 0),
'img.x' : (1, 'scalar_number', 0, 0),
'img.y' : (1, 'scalar_number', 0, 0),
'img.h' : (1, 'scalar_number', 0, 0),
'img.w' : (1, 'scalar_number', 0, 0),
'img.src' : (1, 'scalar_number', 0, 0),
'img.color_src' : (1, 'scalar_number', 0, 0),
'paragraph' : (1, 'snippets', 1, 0),
'paragraph.class' : (1, 'scalar_text', 0, 0),
'paragraph.firstWord' : (1, 'scalar_number', 0, 0),
'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
'word_semantic' : (1, 'snippets', 1, 1),
'word_semantic.type' : (1, 'scalar_text', 0, 0),
'word_semantic.firstWord' : (1, 'scalar_number', 0, 0),
'word_semantic.lastWord' : (1, 'scalar_number', 0, 0),
'word' : (1, 'snippets', 1, 0),
'word.type' : (1, 'scalar_text', 0, 0),
'word.class' : (1, 'scalar_text', 0, 0),
'word.firstGlyph' : (1, 'scalar_number', 0, 0),
'word.lastGlyph' : (1, 'scalar_number', 0, 0),
'_span' : (1, 'snippets', 1, 0),
'_span.firstWord' : (1, 'scalar_number', 0, 0),
'-span.lastWord' : (1, 'scalar_number', 0, 0),
'span' : (1, 'snippets', 1, 0),
'span.firstWord' : (1, 'scalar_number', 0, 0),
'span.lastWord' : (1, 'scalar_number', 0, 0),
'extratokens' : (1, 'snippets', 1, 0),
'extratokens.type' : (1, 'scalar_text', 0, 0),
'extratokens.firstGlyph' : (1, 'scalar_number', 0, 0),
'extratokens.lastGlyph' : (1, 'scalar_number', 0, 0),
'glyph.h' : (1, 'number', 0, 0),
'glyph.w' : (1, 'number', 0, 0),
'glyph.use' : (1, 'number', 0, 0),
'glyph.vtx' : (1, 'number', 0, 1),
'glyph.len' : (1, 'number', 0, 1),
'glyph.dpi' : (1, 'number', 0, 0),
'vtx' : (0, 'number', 1, 1),
'vtx.x' : (1, 'number', 0, 0),
'vtx.y' : (1, 'number', 0, 0),
'len' : (0, 'number', 1, 1),
'len.n' : (1, 'number', 0, 0),
'book' : (1, 'snippets', 1, 0),
'version' : (1, 'snippets', 1, 0),
'version.FlowEdit_1_id' : (1, 'scalar_text', 0, 0),
'version.FlowEdit_1_version' : (1, 'scalar_text', 0, 0),
'version.Schema_id' : (1, 'scalar_text', 0, 0),
'version.Schema_version' : (1, 'scalar_text', 0, 0),
'version.Topaz_version' : (1, 'scalar_text', 0, 0),
'version.WordDetailEdit_1_id' : (1, 'scalar_text', 0, 0),
'version.WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0),
'version.ZoneEdit_1_id' : (1, 'scalar_text', 0, 0),
'version.ZoneEdit_1_version' : (1, 'scalar_text', 0, 0),
'version.chapterheaders' : (1, 'scalar_text', 0, 0),
'version.creation_date' : (1, 'scalar_text', 0, 0),
'version.header_footer' : (1, 'scalar_text', 0, 0),
'version.init_from_ocr' : (1, 'scalar_text', 0, 0),
'version.letter_insertion' : (1, 'scalar_text', 0, 0),
'version.xmlinj_convert' : (1, 'scalar_text', 0, 0),
'version.xmlinj_reflow' : (1, 'scalar_text', 0, 0),
'version.xmlinj_transform' : (1, 'scalar_text', 0, 0),
'version.findlists' : (1, 'scalar_text', 0, 0),
'version.page_num' : (1, 'scalar_text', 0, 0),
'version.page_type' : (1, 'scalar_text', 0, 0),
'version.bad_text' : (1, 'scalar_text', 0, 0),
'version.glyph_mismatch' : (1, 'scalar_text', 0, 0),
'version.margins' : (1, 'scalar_text', 0, 0),
'version.staggered_lines' : (1, 'scalar_text', 0, 0),
'version.paragraph_continuation' : (1, 'scalar_text', 0, 0),
'version.toc' : (1, 'scalar_text', 0, 0),
'stylesheet' : (1, 'snippets', 1, 0),
'style' : (1, 'snippets', 1, 0),
'style._tag' : (1, 'scalar_text', 0, 0),
'style.type' : (1, 'scalar_text', 0, 0),
'style._parent_type' : (1, 'scalar_text', 0, 0),
'style.class' : (1, 'scalar_text', 0, 0),
'style._after_class' : (1, 'scalar_text', 0, 0),
'rule' : (1, 'snippets', 1, 0),
'rule.attr' : (1, 'scalar_text', 0, 0),
'rule.value' : (1, 'scalar_text', 0, 0),
'original' : (0, 'number', 1, 1),
'original.pnum' : (1, 'number', 0, 0),
'original.pid' : (1, 'text', 0, 0),
'pages' : (0, 'number', 1, 1),
'pages.ref' : (1, 'number', 0, 0),
'pages.id' : (1, 'number', 0, 0),
'startID' : (0, 'number', 1, 1),
'startID.page' : (1, 'number', 0, 0),
'startID.id' : (1, 'number', 0, 0),
}
# full tag path record keeping routines
def tag_push(self, token):
self.tagpath.append(token)
def tag_pop(self):
if len(self.tagpath) > 0 :
self.tagpath.pop()
def tagpath_len(self):
return len(self.tagpath)
def get_tagpath(self, i):
cnt = len(self.tagpath)
if i < cnt : result = self.tagpath[i]
for j in xrange(i+1, cnt) :
result += '.' + self.tagpath[j]
return result
# list of absolute command byte values values that indicate
# various types of loop meachanisms typically used to generate vectors
cmd_list = (0x76, 0x76)
# peek at and return 1 byte that is ahead by i bytes
def peek(self, aheadi):
c = self.fo.read(aheadi)
if (len(c) == 0):
return None
self.fo.seek(-aheadi,1)
c = c[-1:]
return ord(c)
# get the next value from the file being processed
def getNext(self):
nbyte = self.peek(1);
if (nbyte == None):
return None
val = readEncodedNumber(self.fo)
return val
# format an arg by argtype
def formatArg(self, arg, argtype):
if (argtype == 'text') or (argtype == 'scalar_text') :
result = self.dict.lookup(arg)
elif (argtype == 'raw') or (argtype == 'number') or (argtype == 'scalar_number') :
result = arg
elif (argtype == 'snippets') :
result = arg
else :
print "Error Unknown argtype %s" % argtype
sys.exit(-2)
return result
# process the next tag token, recursively handling subtags,
# arguments, and commands
def procToken(self, token):
known_token = False
self.tag_push(token)
if self.debug : print 'Processing: ', self.get_tagpath(0)
cnt = self.tagpath_len()
for j in xrange(cnt):
tkn = self.get_tagpath(j)
if tkn in self.token_tags :
num_args = self.token_tags[tkn][0]
argtype = self.token_tags[tkn][1]
subtags = self.token_tags[tkn][2]
splcase = self.token_tags[tkn][3]
ntags = -1
known_token = True
break
if known_token :
# handle subtags if present
subtagres = []
if (splcase == 1):
# this type of tag uses of escape marker 0x74 indicate subtag count
if self.peek(1) == 0x74:
skip = readEncodedNumber(self.fo)
subtags = 1
num_args = 0
if (subtags == 1):
ntags = readEncodedNumber(self.fo)
if self.debug : print 'subtags: ' + token + ' has ' + str(ntags)
for j in xrange(ntags):
val = readEncodedNumber(self.fo)
subtagres.append(self.procToken(self.dict.lookup(val)))
# arguments can be scalars or vectors of text or numbers
argres = []
if num_args > 0 :
firstarg = self.peek(1)
if (firstarg in self.cmd_list) and (argtype != 'scalar_number') and (argtype != 'scalar_text'):
# single argument is a variable length vector of data
arg = readEncodedNumber(self.fo)
argres = self.decodeCMD(arg,argtype)
else :
# num_arg scalar arguments
for i in xrange(num_args):
argres.append(self.formatArg(readEncodedNumber(self.fo), argtype))
# build the return tag
result = []
tkn = self.get_tagpath(0)
result.append(tkn)
result.append(subtagres)
result.append(argtype)
result.append(argres)
self.tag_pop()
return result
# all tokens that need to be processed should be in the hash
# table if it may indicate a problem, either new token
# or an out of sync condition
else:
result = []
if (self.debug):
print 'Unknown Token:', token
self.tag_pop()
return result
# special loop used to process code snippets
# it is NEVER used to format arguments.
# builds the snippetList
def doLoop72(self, argtype):
cnt = readEncodedNumber(self.fo)
if self.debug :
result = 'Set of '+ str(cnt) + ' xml snippets. The overall structure \n'
result += 'of the document is indicated by snippet number sets at the\n'
result += 'end of each snippet. \n'
print result
for i in xrange(cnt):
if self.debug: print 'Snippet:',str(i)
snippet = []
snippet.append(i)
val = readEncodedNumber(self.fo)
snippet.append(self.procToken(self.dict.lookup(val)))
self.snippetList.append(snippet)
return
# general loop code gracisouly submitted by "skindle" - thank you!
def doLoop76Mode(self, argtype, cnt, mode):
result = []
adj = 0
if mode & 1:
adj = readEncodedNumber(self.fo)
mode = mode >> 1
x = []
for i in xrange(cnt):
x.append(readEncodedNumber(self.fo) - adj)
for i in xrange(mode):
for j in xrange(1, cnt):
x[j] = x[j] + x[j - 1]
for i in xrange(cnt):
result.append(self.formatArg(x[i],argtype))
return result
# dispatches loop commands bytes with various modes
# The 0x76 style loops are used to build vectors
# This was all derived by trial and error and
# new loop types may exist that are not handled here
# since they did not appear in the test cases
def decodeCMD(self, cmd, argtype):
if (cmd == 0x76):
# loop with cnt, and mode to control loop styles
cnt = readEncodedNumber(self.fo)
mode = readEncodedNumber(self.fo)
if self.debug : print 'Loop for', cnt, 'with mode', mode, ': '
return self.doLoop76Mode(argtype, cnt, mode)
if self.dbug: print "Unknown command", cmd
result = []
return result
# add full tag path to injected snippets
def updateName(self, tag, prefix):
name = tag[0]
subtagList = tag[1]
argtype = tag[2]
argList = tag[3]
nname = prefix + '.' + name
nsubtaglist = []
for j in subtagList:
nsubtaglist.append(self.updateName(j,prefix))
ntag = []
ntag.append(nname)
ntag.append(nsubtaglist)
ntag.append(argtype)
ntag.append(argList)
return ntag
# perform depth first injection of specified snippets into this one
def injectSnippets(self, snippet):
snipno, tag = snippet
name = tag[0]
subtagList = tag[1]
argtype = tag[2]
argList = tag[3]
nsubtagList = []
if len(argList) > 0 :
for j in argList:
asnip = self.snippetList[j]
aso, atag = self.injectSnippets(asnip)
atag = self.updateName(atag, name)
nsubtagList.append(atag)
argtype='number'
argList=[]
if len(nsubtagList) > 0 :
subtagList.extend(nsubtagList)
tag = []
tag.append(name)
tag.append(subtagList)
tag.append(argtype)
tag.append(argList)
snippet = []
snippet.append(snipno)
snippet.append(tag)
return snippet
# format the tag for output
def formatTag(self, node):
name = node[0]
subtagList = node[1]
argtype = node[2]
argList = node[3]
fullpathname = name.split('.')
nodename = fullpathname.pop()
ilvl = len(fullpathname)
indent = ' ' * (3 * ilvl)
result = indent + '<' + nodename + '>'
if len(argList) > 0:
argres = ''
for j in argList:
if (argtype == 'text') or (argtype == 'scalar_text') :
argres += j + '|'
else :
argres += str(j) + ','
argres = argres[0:-1]
if argtype == 'snippets' :
result += 'snippets:' + argres
else :
result += argres
if len(subtagList) > 0 :
result += '\n'
for j in subtagList:
if len(j) > 0 :
result += self.formatTag(j)
result += indent + '</' + nodename + '>\n'
else:
result += '</' + nodename + '>\n'
return result
# flatten tag
def flattenTag(self, node):
name = node[0]
subtagList = node[1]
argtype = node[2]
argList = node[3]
result = name
if (len(argList) > 0):
argres = ''
for j in argList:
if (argtype == 'text') or (argtype == 'scalar_text') :
argres += j + '|'
else :
argres += str(j) + '|'
argres = argres[0:-1]
if argtype == 'snippets' :
result += '.snippets=' + argres
else :
result += '=' + argres
result += '\n'
for j in subtagList:
if len(j) > 0 :
result += self.flattenTag(j)
return result
# reduce create xml output
def formatDoc(self, flat_xml):
result = ''
for j in self.doc :
if len(j) > 0:
if flat_xml:
result += self.flattenTag(j)
else:
result += self.formatTag(j)
if self.debug : print result
return result
# main loop - parse the page.dat files
# to create structured document and snippets
# FIXME: value at end of magic appears to be a subtags count
# but for what? For now, inject an 'info" tag as it is in
# every dictionary and seems close to what is meant
# The alternative is to special case the last _ "0x5f" to mean something
def process(self):
# peek at the first bytes to see what type of file it is
magic = self.fo.read(9)
if (magic[0:1] == 'p') and (magic[2:9] == 'marker_'):
first_token = 'info'
elif (magic[0:1] == 'p') and (magic[2:9] == '__PAGE_'):
skip = self.fo.read(2)
first_token = 'info'
elif (magic[0:1] == 'p') and (magic[2:8] == '_PAGE_'):
first_token = 'info'
elif (magic[0:1] == 'g') and (magic[2:9] == '__GLYPH'):
skip = self.fo.read(3)
first_token = 'info'
else :
# other0.dat file
first_token = None
self.fo.seek(-9,1)
# main loop to read and build the document tree
while True:
if first_token != None :
# use "inserted" first token 'info' for page and glyph files
tag = self.procToken(first_token)
if len(tag) > 0 :
self.doc.append(tag)
first_token = None
v = self.getNext()
if (v == None):
break
if (v == 0x72):
self.doLoop72('number')
elif (v > 0) and (v < self.dict.getSize()) :
tag = self.procToken(self.dict.lookup(v))
if len(tag) > 0 :
self.doc.append(tag)
else:
if self.debug:
print "Main Loop: Unknown value: %x" % v
if (v == 0):
if (self.peek(1) == 0x5f):
skip = self.fo.read(1)
first_token = 'info'
# now do snippet injection
if len(self.snippetList) > 0 :
if self.debug : print 'Injecting Snippets:'
snippet = self.injectSnippets(self.snippetList[0])
snipno = snippet[0]
tag_add = snippet[1]
if self.debug : print self.formatTag(tag_add)
if len(tag_add) > 0:
self.doc.append(tag_add)
# handle generation of xml output
xmlpage = self.formatDoc(self.flat_xml)
return xmlpage
def fromData(dict, fname):
flat_xml = True
debug = False
pp = PageParser(fname, dict, debug, flat_xml)
xmlpage = pp.process()
return xmlpage
def getXML(dict, fname):
flat_xml = False
debug = False
pp = PageParser(fname, dict, debug, flat_xml)
xmlpage = pp.process()
return xmlpage
def usage():
print 'Usage: '
print ' convert2xml.py dict0000.dat infile.dat '
print ' '
print ' Options:'
print ' -h print this usage help message '
print ' -d turn on debug output to check for potential errors '
print ' --flat-xml output the flattened xml page description only '
print ' '
print ' This program will attempt to convert a page*.dat file or '
print ' glyphs*.dat file, using the dict0000.dat file, to its xml description. '
print ' '
print ' Use "cmbtc_dump.py" first to unencrypt, uncompress, and dump '
print ' the *.dat files from a Topaz format e-book.'
#
# Main
#
def main(argv):
dictFile = ""
pageFile = ""
debug = False
flat_xml = False
printOutput = False
if len(argv) == 0:
printOutput = True
argv = sys.argv
try:
opts, args = getopt.getopt(argv[1:], "hd", ["flat-xml"])
except getopt.GetoptError, err:
# print help information and exit:
print str(err) # will print something like "option -a not recognized"
usage()
sys.exit(2)
if len(opts) == 0 and len(args) == 0 :
usage()
sys.exit(2)
for o, a in opts:
if o =="-d":
debug=True
if o =="-h":
usage()
sys.exit(0)
if o =="--flat-xml":
flat_xml = True
dictFile, pageFile = args[0], args[1]
# read in the string table dictionary
dict = Dictionary(dictFile)
# dict.dumpDict()
# create a page parser
pp = PageParser(pageFile, dict, debug, flat_xml)
xmlpage = pp.process()
if printOutput:
print xmlpage
return 0
return xmlpage
if __name__ == '__main__':
sys.exit(main(''))

View File

@@ -0,0 +1,706 @@
#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# For use with Topaz Scripts Version 2.6
import sys
import csv
import os
import math
import getopt
from struct import pack
from struct import unpack
class DocParser(object):
def __init__(self, flatxml, classlst, fileid, bookDir, gdict, fixedimage):
self.id = os.path.basename(fileid).replace('.dat','')
self.svgcount = 0
self.docList = flatxml.split('\n')
self.docSize = len(self.docList)
self.classList = {}
self.bookDir = bookDir
self.gdict = gdict
tmpList = classlst.split('\n')
for pclass in tmpList:
if pclass != '':
# remove the leading period from the css name
cname = pclass[1:]
self.classList[cname] = True
self.fixedimage = fixedimage
self.ocrtext = []
self.link_id = []
self.link_title = []
self.link_page = []
self.link_href = []
self.link_type = []
self.dehyphen_rootid = []
self.paracont_stemid = []
self.parastems_stemid = []
def getGlyph(self, gid):
result = ''
id='id="gl%d"' % gid
return self.gdict.lookup(id)
def glyphs_to_image(self, glyphList):
def extract(path, key):
b = path.find(key) + len(key)
e = path.find(' ',b)
return int(path[b:e])
svgDir = os.path.join(self.bookDir,'svg')
imgDir = os.path.join(self.bookDir,'img')
imgname = self.id + '_%04d.svg' % self.svgcount
imgfile = os.path.join(imgDir,imgname)
# get glyph information
gxList = self.getData('info.glyph.x',0,-1)
gyList = self.getData('info.glyph.y',0,-1)
gidList = self.getData('info.glyph.glyphID',0,-1)
gids = []
maxws = []
maxhs = []
xs = []
ys = []
gdefs = []
# get path defintions, positions, dimensions for ecah glyph
# that makes up the image, and find min x and min y to reposition origin
minx = -1
miny = -1
for j in glyphList:
gid = gidList[j]
gids.append(gid)
xs.append(gxList[j])
if minx == -1: minx = gxList[j]
else : minx = min(minx, gxList[j])
ys.append(gyList[j])
if miny == -1: miny = gyList[j]
else : miny = min(miny, gyList[j])
path = self.getGlyph(gid)
gdefs.append(path)
maxws.append(extract(path,'width='))
maxhs.append(extract(path,'height='))
# change the origin to minx, miny and calc max height and width
maxw = maxws[0] + xs[0] - minx
maxh = maxhs[0] + ys[0] - miny
for j in xrange(0, len(xs)):
xs[j] = xs[j] - minx
ys[j] = ys[j] - miny
maxw = max( maxw, (maxws[j] + xs[j]) )
maxh = max( maxh, (maxhs[j] + ys[j]) )
# open the image file for output
ifile = open(imgfile,'w')
ifile.write('<?xml version="1.0" standalone="no"?>\n')
ifile.write('<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n')
ifile.write('<svg width="%dpx" height="%dpx" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">\n' % (math.floor(maxw/10), math.floor(maxh/10), maxw, maxh))
ifile.write('<defs>\n')
for j in xrange(0,len(gdefs)):
ifile.write(gdefs[j])
ifile.write('</defs>\n')
for j in xrange(0,len(gids)):
ifile.write('<use xlink:href="#gl%d" x="%d" y="%d" />\n' % (gids[j], xs[j], ys[j]))
ifile.write('</svg>')
ifile.close()
return 0
# return tag at line pos in document
def lineinDoc(self, pos) :
if (pos >= 0) and (pos < self.docSize) :
item = self.docList[pos]
if item.find('=') >= 0:
(name, argres) = item.split('=',1)
else :
name = item
argres = ''
return name, argres
# find tag in doc if within pos to end inclusive
def findinDoc(self, tagpath, pos, end) :
result = None
if end == -1 :
end = self.docSize
else:
end = min(self.docSize, end)
foundat = -1
for j in xrange(pos, end):
item = self.docList[j]
if item.find('=') >= 0:
(name, argres) = item.split('=',1)
else :
name = item
argres = ''
if name.endswith(tagpath) :
result = argres
foundat = j
break
return foundat, result
# return list of start positions for the tagpath
def posinDoc(self, tagpath):
startpos = []
pos = 0
res = ""
while res != None :
(foundpos, res) = self.findinDoc(tagpath, pos, -1)
if res != None :
startpos.append(foundpos)
pos = foundpos + 1
return startpos
# returns a vector of integers for the tagpath
def getData(self, tagpath, pos, end):
argres=[]
(foundat, argt) = self.findinDoc(tagpath, pos, end)
if (argt != None) and (len(argt) > 0) :
argList = argt.split('|')
argres = [ int(strval) for strval in argList]
return argres
# get the class
def getClass(self, pclass):
nclass = pclass
# class names are an issue given topaz may start them with numerals (not allowed),
# use a mix of cases (which cause some browsers problems), and actually
# attach numbers after "_reclustered*" to the end to deal classeses that inherit
# from a base class (but then not actually provide all of these _reclustereed
# classes in the stylesheet!
# so we clean this up by lowercasing, prepend 'cl-', and getting any baseclass
# that exists in the stylesheet first, and then adding this specific class
# after
# also some class names have spaces in them so need to convert to dashes
if nclass != None :
nclass = nclass.replace(' ','-')
classres = ''
nclass = nclass.lower()
nclass = 'cl-' + nclass
baseclass = ''
# graphic is the base class for captions
if nclass.find('cl-cap-') >=0 :
classres = 'graphic' + ' '
else :
# strip to find baseclass
p = nclass.find('_')
if p > 0 :
baseclass = nclass[0:p]
if baseclass in self.classList:
classres += baseclass + ' '
classres += nclass
nclass = classres
return nclass
# develop a sorted description of the starting positions of
# groups and regions on the page, as well as the page type
def PageDescription(self):
def compare(x, y):
(xtype, xval) = x
(ytype, yval) = y
if xval > yval:
return 1
if xval == yval:
return 0
return -1
result = []
(pos, pagetype) = self.findinDoc('page.type',0,-1)
groupList = self.posinDoc('page.group')
groupregionList = self.posinDoc('page.group.region')
pageregionList = self.posinDoc('page.region')
# integrate into one list
for j in groupList:
result.append(('grpbeg',j))
for j in groupregionList:
result.append(('gregion',j))
for j in pageregionList:
result.append(('pregion',j))
result.sort(compare)
# insert group end and page end indicators
inGroup = False
j = 0
while True:
if j == len(result): break
rtype = result[j][0]
rval = result[j][1]
if not inGroup and (rtype == 'grpbeg') :
inGroup = True
j = j + 1
elif inGroup and (rtype in ('grpbeg', 'pregion')):
result.insert(j,('grpend',rval))
inGroup = False
else:
j = j + 1
if inGroup:
result.append(('grpend',-1))
result.append(('pageend', -1))
return pagetype, result
# build a description of the paragraph
def getParaDescription(self, start, end, regtype):
result = []
# paragraph
(pos, pclass) = self.findinDoc('paragraph.class',start,end)
pclass = self.getClass(pclass)
# build up a description of the paragraph in result and return it
# first check for the basic - all words paragraph
(pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
(pos, slast) = self.findinDoc('paragraph.lastWord',start,end)
if (sfirst != None) and (slast != None) :
first = int(sfirst)
last = int(slast)
makeImage = (regtype == 'vertical') or (regtype == 'table')
if self.fixedimage:
makeImage = makeImage or (regtype == 'fixed')
if (pclass != None):
makeImage = makeImage or (pclass.find('.inverted') >= 0)
if self.fixedimage :
makeImage = makeImage or (pclass.find('cl-f-') >= 0)
if not makeImage :
# standard all word paragraph
for wordnum in xrange(first, last):
result.append(('ocr', wordnum))
return pclass, result
# convert paragraph to svg image
# translate first and last word into first and last glyphs
# and generate inline image and include it
glyphList = []
firstglyphList = self.getData('word.firstGlyph',0,-1)
gidList = self.getData('info.glyph.glyphID',0,-1)
firstGlyph = firstglyphList[first]
if last < len(firstglyphList):
lastGlyph = firstglyphList[last]
else :
lastGlyph = len(gidList)
for glyphnum in xrange(firstGlyph, lastGlyph):
glyphList.append(glyphnum)
# include any extratokens if they exist
(pos, sfg) = self.findinDoc('extratokens.firstGlyph',start,end)
(pos, slg) = self.findinDoc('extratokens.lastGlyph',start,end)
if (sfg != None) and (slg != None):
for glyphnum in xrange(int(sfg), int(slg)):
glyphList.append(glyphnum)
num = self.svgcount
self.glyphs_to_image(glyphList)
self.svgcount += 1
result.append(('svg', num))
return pclass, result
# this type of paragraph may be made up of multiple spans, inline
# word monograms (images), and words with semantic meaning,
# plus glyphs used to form starting letter of first word
# need to parse this type line by line
line = start + 1
word_class = ''
# if end is -1 then we must search to end of document
if end == -1 :
end = self.docSize
# seems some xml has last* coming before first* so we have to
# handle any order
sp_first = -1
sp_last = -1
gl_first = -1
gl_last = -1
ws_first = -1
ws_last = -1
word_class = ''
while (line < end) :
(name, argres) = self.lineinDoc(line)
if name.endswith('span.firstWord') :
sp_first = int(argres)
elif name.endswith('span.lastWord') :
sp_last = int(argres)
elif name.endswith('word.firstGlyph') :
gl_first = int(argres)
elif name.endswith('word.lastGlyph') :
gl_last = int(argres)
elif name.endswith('word_semantic.firstWord'):
ws_first = int(argres)
elif name.endswith('word_semantic.lastWord'):
ws_last = int(argres)
elif name.endswith('word.class'):
(cname, space) = argres.split('-',1)
if space == '' : space = '0'
if (cname == 'spaceafter') and (int(space) > 0) :
word_class = 'sa'
elif name.endswith('word.img.src'):
result.append(('img' + word_class, int(argres)))
word_class = ''
elif name.endswith('region.img.src'):
result.append(('img' + word_class, int(argres)))
if (sp_first != -1) and (sp_last != -1):
for wordnum in xrange(sp_first, sp_last):
result.append(('ocr', wordnum))
sp_first = -1
sp_last = -1
if (gl_first != -1) and (gl_last != -1):
glyphList = []
for glyphnum in xrange(gl_first, gl_last):
glyphList.append(glyphnum)
num = self.svgcount
self.glyphs_to_image(glyphList)
self.svgcount += 1
result.append(('svg', num))
gl_first = -1
gl_last = -1
if (ws_first != -1) and (ws_last != -1):
for wordnum in xrange(ws_first, ws_last):
result.append(('ocr', wordnum))
ws_first = -1
ws_last = -1
line += 1
return pclass, result
def buildParagraph(self, pclass, pdesc, type, regtype) :
parares = ''
sep =''
classres = ''
if pclass :
classres = ' class="' + pclass + '"'
br_lb = (regtype == 'fixed') or (regtype == 'chapterheading') or (regtype == 'vertical')
handle_links = len(self.link_id) > 0
if (type == 'full') or (type == 'begin') :
parares += '<p' + classres + '>'
if (type == 'end'):
parares += ' '
lstart = len(parares)
cnt = len(pdesc)
for j in xrange( 0, cnt) :
(wtype, num) = pdesc[j]
if wtype == 'ocr' :
word = self.ocrtext[num]
sep = ' '
if handle_links:
link = self.link_id[num]
if (link > 0):
linktype = self.link_type[link-1]
title = self.link_title[link-1]
if (title == "") or (parares.rfind(title) < 0):
title=parares[lstart:]
if linktype == 'external' :
linkhref = self.link_href[link-1]
linkhtml = '<a href="%s">' % linkhref
else :
if len(self.link_page) >= link :
ptarget = self.link_page[link-1] - 1
linkhtml = '<a href="#page%04d">' % ptarget
else :
# just link to the current page
linkhtml = '<a href="#' + self.id + '">'
linkhtml += title + '</a>'
pos = parares.rfind(title)
if pos >= 0:
parares = parares[0:pos] + linkhtml + parares[pos+len(title):]
else :
parares += linkhtml
lstart = len(parares)
if word == '_link_' : word = ''
elif (link < 0) :
if word == '_link_' : word = ''
if word == '_lb_':
if ((num-1) in self.dehyphen_rootid ) or handle_links:
word = ''
sep = ''
elif br_lb :
word = '<br />\n'
sep = ''
else :
word = '\n'
sep = ''
if num in self.dehyphen_rootid :
word = word[0:-1]
sep = ''
parares += word + sep
elif wtype == 'img' :
sep = ''
parares += '<img src="img/img%04d.jpg" alt="" />' % num
parares += sep
elif wtype == 'imgsa' :
sep = ' '
parares += '<img src="img/img%04d.jpg" alt="" />' % num
parares += sep
elif wtype == 'svg' :
sep = ''
parares += '<img src="img/' + self.id + '_%04d.svg" alt="" />' % num
parares += sep
if len(sep) > 0 : parares = parares[0:-1]
if (type == 'full') or (type == 'end') :
parares += '</p>'
return parares
# walk the document tree collecting the information needed
# to build an html page using the ocrText
def process(self):
htmlpage = ''
# get the ocr text
(pos, argres) = self.findinDoc('info.word.ocrText',0,-1)
if argres : self.ocrtext = argres.split('|')
# get information to dehyphenate the text
self.dehyphen_rootid = self.getData('info.dehyphen.rootID',0,-1)
# determine if first paragraph is continued from previous page
(pos, self.parastems_stemid) = self.findinDoc('info.paraStems.stemID',0,-1)
first_para_continued = (self.parastems_stemid != None)
# determine if last paragraph is continued onto the next page
(pos, self.paracont_stemid) = self.findinDoc('info.paraCont.stemID',0,-1)
last_para_continued = (self.paracont_stemid != None)
# collect link ids
self.link_id = self.getData('info.word.link_id',0,-1)
# collect link destination page numbers
self.link_page = self.getData('info.links.page',0,-1)
# collect link types (container versus external)
(pos, argres) = self.findinDoc('info.links.type',0,-1)
if argres : self.link_type = argres.split('|')
# collect link destinations
(pos, argres) = self.findinDoc('info.links.href',0,-1)
if argres : self.link_href = argres.split('|')
# collect link titles
(pos, argres) = self.findinDoc('info.links.title',0,-1)
if argres :
self.link_title = argres.split('|')
else:
self.link_title.append('')
# get a descriptions of the starting points of the regions
# and groups on the page
(pagetype, pageDesc) = self.PageDescription()
regcnt = len(pageDesc) - 1
anchorSet = False
breakSet = False
inGroup = False
# process each region on the page and convert what you can to html
for j in xrange(regcnt):
(etype, start) = pageDesc[j]
(ntype, end) = pageDesc[j+1]
# set anchor for link target on this page
if not anchorSet and not first_para_continued:
htmlpage += '<div style="visibility: hidden; height: 0; width: 0;" id="'
htmlpage += self.id + '" title="pagetype_' + pagetype + '"></div>\n'
anchorSet = True
# handle groups of graphics with text captions
if (etype == 'grpbeg'):
(pos, grptype) = self.findinDoc('group.type', start, end)
if grptype != None:
if grptype == 'graphic':
gcstr = ' class="' + grptype + '"'
htmlpage += '<div' + gcstr + '>'
inGroup = True
elif (etype == 'grpend'):
if inGroup:
htmlpage += '</div>\n'
inGroup = False
else:
(pos, regtype) = self.findinDoc('region.type',start,end)
if regtype == 'graphic' :
(pos, simgsrc) = self.findinDoc('img.src',start,end)
if simgsrc:
if inGroup:
htmlpage += '<img src="img/img%04d.jpg" alt="" />' % int(simgsrc)
else:
htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
elif regtype == 'chapterheading' :
(pclass, pdesc) = self.getParaDescription(start,end, regtype)
if not breakSet:
htmlpage += '<div style="page-break-after: always;">&nbsp;</div>\n'
breakSet = True
tag = 'h1'
if pclass and (len(pclass) >= 7):
if pclass[3:7] == 'ch1-' : tag = 'h1'
if pclass[3:7] == 'ch2-' : tag = 'h2'
if pclass[3:7] == 'ch3-' : tag = 'h3'
htmlpage += '<' + tag + ' class="' + pclass + '">'
else:
htmlpage += '<' + tag + '>'
htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
htmlpage += '</' + tag + '>'
elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') or (regtype == 'listitem'):
ptype = 'full'
# check to see if this is a continution from the previous page
if first_para_continued :
ptype = 'end'
first_para_continued = False
(pclass, pdesc) = self.getParaDescription(start,end, regtype)
if pclass and (len(pclass) >= 6) and (ptype == 'full'):
tag = 'p'
if pclass[3:6] == 'h1-' : tag = 'h4'
if pclass[3:6] == 'h2-' : tag = 'h5'
if pclass[3:6] == 'h3-' : tag = 'h6'
htmlpage += '<' + tag + ' class="' + pclass + '">'
htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
htmlpage += '</' + tag + '>'
else :
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
elif (regtype == 'tocentry') :
ptype = 'full'
if first_para_continued :
ptype = 'end'
first_para_continued = False
(pclass, pdesc) = self.getParaDescription(start,end, regtype)
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
elif (regtype == 'vertical') or (regtype == 'table') :
ptype = 'full'
if inGroup:
ptype = 'middle'
if first_para_continued :
ptype = 'end'
first_para_continued = False
(pclass, pdesc) = self.getParaDescription(start, end, regtype)
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
elif (regtype == 'synth_fcvr.center'):
(pos, simgsrc) = self.findinDoc('img.src',start,end)
if simgsrc:
htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
else :
print ' Making region type', regtype,
(pos, temp) = self.findinDoc('paragraph',start,end)
(pos2, temp) = self.findinDoc('span',start,end)
if pos != -1 or pos2 != -1:
print ' a "text" region'
orig_regtype = regtype
regtype = 'fixed'
ptype = 'full'
# check to see if this is a continution from the previous page
if first_para_continued :
ptype = 'end'
first_para_continued = False
(pclass, pdesc) = self.getParaDescription(start,end, regtype)
if not pclass:
if orig_regtype.endswith('.right') : pclass = 'cl-right'
elif orig_regtype.endswith('.center') : pclass = 'cl-center'
elif orig_regtype.endswith('.left') : pclass = 'cl-left'
elif orig_regtype.endswith('.justify') : pclass = 'cl-justify'
if pclass and (ptype == 'full') and (len(pclass) >= 6):
tag = 'p'
if pclass[3:6] == 'h1-' : tag = 'h4'
if pclass[3:6] == 'h2-' : tag = 'h5'
if pclass[3:6] == 'h3-' : tag = 'h6'
htmlpage += '<' + tag + ' class="' + pclass + '">'
htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
htmlpage += '</' + tag + '>'
else :
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
else :
print ' a "graphic" region'
(pos, simgsrc) = self.findinDoc('img.src',start,end)
if simgsrc:
htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
if last_para_continued :
if htmlpage[-4:] == '</p>':
htmlpage = htmlpage[0:-4]
last_para_continued = False
return htmlpage
def convert2HTML(flatxml, classlst, fileid, bookDir, gdict, fixedimage):
# create a document parser
dp = DocParser(flatxml, classlst, fileid, bookDir, gdict, fixedimage)
htmlpage = dp.process()
return htmlpage

View File

@@ -0,0 +1,151 @@
#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
import sys
import csv
import os
import getopt
from struct import pack
from struct import unpack
class PParser(object):
def __init__(self, gd, flatxml):
self.gd = gd
self.flatdoc = flatxml.split('\n')
self.temp = []
foo = self.getData('page.h') or self.getData('book.h')
self.ph = foo[0]
foo = self.getData('page.w') or self.getData('book.w')
self.pw = foo[0]
self.gx = self.getData('info.glyph.x')
self.gy = self.getData('info.glyph.y')
self.gid = self.getData('info.glyph.glyphID')
def getData(self, path):
result = None
cnt = len(self.flatdoc)
for j in xrange(cnt):
item = self.flatdoc[j]
if item.find('=') >= 0:
(name, argt) = item.split('=')
argres = argt.split('|')
else:
name = item
argres = []
if (name.endswith(path)):
result = argres
break
if (len(argres) > 0) :
for j in xrange(0,len(argres)):
argres[j] = int(argres[j])
return result
def getDataTemp(self, path):
result = None
cnt = len(self.temp)
for j in xrange(cnt):
item = self.temp[j]
if item.find('=') >= 0:
(name, argt) = item.split('=')
argres = argt.split('|')
else:
name = item
argres = []
if (name.endswith(path)):
result = argres
self.temp.pop(j)
break
if (len(argres) > 0) :
for j in xrange(0,len(argres)):
argres[j] = int(argres[j])
return result
def getImages(self):
result = []
self.temp = self.flatdoc
while (self.getDataTemp('img') != None):
h = self.getDataTemp('img.h')[0]
w = self.getDataTemp('img.w')[0]
x = self.getDataTemp('img.x')[0]
y = self.getDataTemp('img.y')[0]
src = self.getDataTemp('img.src')[0]
result.append('<image xlink:href="../img/img%04d.jpg" x="%d" y="%d" width="%d" height="%d" />\n' % (src, x, y, w, h))
return result
def getGlyphs(self):
result = []
if (self.gid != None) and (len(self.gid) > 0):
glyphs = []
for j in set(self.gid):
glyphs.append(j)
glyphs.sort()
for gid in glyphs:
id='id="gl%d"' % gid
path = self.gd.lookup(id)
if path:
result.append(id + ' ' + path)
return result
def convert2SVG(gdict, flat_xml, counter, numfiles, svgDir, raw, meta_array, scaledpi):
ml = ''
pp = PParser(gdict, flat_xml)
ml += '<?xml version="1.0" standalone="no"?>\n'
if (raw):
ml += '<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n'
ml += '<svg width="%fin" height="%fin" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">\n' % (pp.pw / scaledpi, pp.ph / scaledpi, pp.pw -1, pp.ph -1)
ml += '<title>Page %d - %s by %s</title>\n' % (counter, meta_array['Title'],meta_array['Authors'])
else:
ml += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
ml += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" ><head>\n'
ml += '<title>Page %d - %s by %s</title>\n' % (counter, meta_array['Title'],meta_array['Authors'])
ml += '<script><![CDATA[\n'
ml += 'function gd(){var p=window.location.href.replace(/^.*\?dpi=(\d+).*$/i,"$1");return p;}\n'
ml += 'var dpi=%d;\n' % scaledpi
if (counter) :
ml += 'var prevpage="page%04d.xhtml";\n' % (counter - 1)
if (counter < numfiles-1) :
ml += 'var nextpage="page%04d.xhtml";\n' % (counter + 1)
ml += 'var pw=%d;var ph=%d;' % (pp.pw, pp.ph)
ml += 'function zoomin(){dpi=dpi*(0.8);setsize();}\n'
ml += 'function zoomout(){dpi=dpi*1.25;setsize();}\n'
ml += 'function setsize(){var svg=document.getElementById("svgimg");var prev=document.getElementById("prevsvg");var next=document.getElementById("nextsvg");var width=(pw/dpi)+"in";var height=(ph/dpi)+"in";svg.setAttribute("width",width);svg.setAttribute("height",height);prev.setAttribute("height",height);prev.setAttribute("width","50px");next.setAttribute("height",height);next.setAttribute("width","50px");}\n'
ml += 'function ppage(){window.location.href=prevpage+"?dpi="+Math.round(dpi);}\n'
ml += 'function npage(){window.location.href=nextpage+"?dpi="+Math.round(dpi);}\n'
ml += 'var gt=gd();if(gt>0){dpi=gt;}\n'
ml += 'window.onload=setsize;\n'
ml += ']]></script>\n'
ml += '</head>\n'
ml += '<body onLoad="setsize();" style="background-color:#777;text-align:center;">\n'
ml += '<div style="white-space:nowrap;">\n'
if (counter == 0) :
ml += '<a href="javascript:ppage();"><svg id="prevsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"></svg></a>\n'
else:
ml += '<a href="javascript:ppage();"><svg id="prevsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"><polygon points="5,150,95,5,95,295" fill="#AAAAAA" /></svg></a>\n'
ml += '<a href="javascript:npage();"><svg id="svgimg" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" style="background-color:#FFF;border:1px solid black;">' % (pp.pw, pp.ph)
if (pp.gid != None):
ml += '<defs>\n'
gdefs = pp.getGlyphs()
for j in xrange(0,len(gdefs)):
ml += gdefs[j]
ml += '</defs>\n'
img = pp.getImages()
if (img != None):
for j in xrange(0,len(img)):
ml += img[j]
if (pp.gid != None):
for j in xrange(0,len(pp.gid)):
ml += '<use xlink:href="#gl%d" x="%d" y="%d" />\n' % (pp.gid[j], pp.gx[j], pp.gy[j])
if (img == None or len(img) == 0) and (pp.gid == None or len(pp.gid) == 0):
ml += '<text x="10" y="10" font-family="Helvetica" font-size="100" stroke="black">This page intentionally left blank.</text>\n<text x="10" y="110" font-family="Helvetica" font-size="50" stroke="black">Until this notice unintentionally gave it content. (gensvg.py)</text>\n'
if (raw) :
ml += '</svg>'
else :
ml += '</svg></a>\n'
if (counter == numfiles - 1) :
ml += '<a href="javascript:npage();"><svg id="nextsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"></svg></a>\n'
else :
ml += '<a href="javascript:npage();"><svg id="nextsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"><polygon points="5,5,5,295,95,150" fill="#AAAAAA" /></svg></a>\n'
ml += '</div>\n'
ml += '<div><a href="javascript:zoomin();">zoom in</a> - <a href="javascript:zoomout();">zoom out</a></div>\n'
ml += '</body>\n'
ml += '</html>\n'
return ml

View File

@@ -0,0 +1,561 @@
#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
class Unbuffered:
def __init__(self, stream):
self.stream = stream
def write(self, data):
self.stream.write(data)
self.stream.flush()
def __getattr__(self, attr):
return getattr(self.stream, attr)
import sys
sys.stdout=Unbuffered(sys.stdout)
import csv
import os
import getopt
from struct import pack
from struct import unpack
# local support routines
import convert2xml
import flatxml2html
import flatxml2svg
import stylexml2css
# Get a 7 bit encoded number from a file
def readEncodedNumber(file):
flag = False
c = file.read(1)
if (len(c) == 0):
return None
data = ord(c)
if data == 0xFF:
flag = True
c = file.read(1)
if (len(c) == 0):
return None
data = ord(c)
if data >= 0x80:
datax = (data & 0x7F)
while data >= 0x80 :
c = file.read(1)
if (len(c) == 0):
return None
data = ord(c)
datax = (datax <<7) + (data & 0x7F)
data = datax
if flag:
data = -data
return data
# Get a length prefixed string from the file
def lengthPrefixString(data):
return encodeNumber(len(data))+data
def readString(file):
stringLength = readEncodedNumber(file)
if (stringLength == None):
return None
sv = file.read(stringLength)
if (len(sv) != stringLength):
return ""
return unpack(str(stringLength)+"s",sv)[0]
def getMetaArray(metaFile):
# parse the meta file
result = {}
fo = file(metaFile,'rb')
size = readEncodedNumber(fo)
for i in xrange(size):
tag = readString(fo)
value = readString(fo)
result[tag] = value
# print tag, value
fo.close()
return result
# dictionary of all text strings by index value
class Dictionary(object):
def __init__(self, dictFile):
self.filename = dictFile
self.size = 0
self.fo = file(dictFile,'rb')
self.stable = []
self.size = readEncodedNumber(self.fo)
for i in xrange(self.size):
self.stable.append(self.escapestr(readString(self.fo)))
self.pos = 0
def escapestr(self, str):
str = str.replace('&','&amp;')
str = str.replace('<','&lt;')
str = str.replace('>','&gt;')
str = str.replace('=','&#61;')
return str
def lookup(self,val):
if ((val >= 0) and (val < self.size)) :
self.pos = val
return self.stable[self.pos]
else:
print "Error - %d outside of string table limits" % val
sys.exit(-1)
def getSize(self):
return self.size
def getPos(self):
return self.pos
class PageDimParser(object):
def __init__(self, flatxml):
self.flatdoc = flatxml.split('\n')
# find tag if within pos to end inclusive
def findinDoc(self, tagpath, pos, end) :
result = None
docList = self.flatdoc
cnt = len(docList)
if end == -1 :
end = cnt
else:
end = min(cnt,end)
foundat = -1
for j in xrange(pos, end):
item = docList[j]
if item.find('=') >= 0:
(name, argres) = item.split('=')
else :
name = item
argres = ''
if name.endswith(tagpath) :
result = argres
foundat = j
break
return foundat, result
def process(self):
(pos, sph) = self.findinDoc('page.h',0,-1)
(pos, spw) = self.findinDoc('page.w',0,-1)
if (sph == None): sph = '-1'
if (spw == None): spw = '-1'
return sph, spw
def getPageDim(flatxml):
# create a document parser
dp = PageDimParser(flatxml)
(ph, pw) = dp.process()
return ph, pw
class GParser(object):
def __init__(self, flatxml):
self.flatdoc = flatxml.split('\n')
self.dpi = 1440
self.gh = self.getData('info.glyph.h')
self.gw = self.getData('info.glyph.w')
self.guse = self.getData('info.glyph.use')
if self.guse :
self.count = len(self.guse)
else :
self.count = 0
self.gvtx = self.getData('info.glyph.vtx')
self.glen = self.getData('info.glyph.len')
self.gdpi = self.getData('info.glyph.dpi')
self.vx = self.getData('info.vtx.x')
self.vy = self.getData('info.vtx.y')
self.vlen = self.getData('info.len.n')
if self.vlen :
self.glen.append(len(self.vlen))
elif self.glen:
self.glen.append(0)
if self.vx :
self.gvtx.append(len(self.vx))
elif self.gvtx :
self.gvtx.append(0)
def getData(self, path):
result = None
cnt = len(self.flatdoc)
for j in xrange(cnt):
item = self.flatdoc[j]
if item.find('=') >= 0:
(name, argt) = item.split('=')
argres = argt.split('|')
else:
name = item
argres = []
if (name == path):
result = argres
break
if (len(argres) > 0) :
for j in xrange(0,len(argres)):
argres[j] = int(argres[j])
return result
def getGlyphDim(self, gly):
maxh = (self.gh[gly] * self.dpi) / self.gdpi[gly]
maxw = (self.gw[gly] * self.dpi) / self.gdpi[gly]
return maxh, maxw
def getPath(self, gly):
path = ''
if (gly < 0) or (gly >= self.count):
return path
tx = self.vx[self.gvtx[gly]:self.gvtx[gly+1]]
ty = self.vy[self.gvtx[gly]:self.gvtx[gly+1]]
p = 0
for k in xrange(self.glen[gly], self.glen[gly+1]):
if (p == 0):
zx = tx[0:self.vlen[k]+1]
zy = ty[0:self.vlen[k]+1]
else:
zx = tx[self.vlen[k-1]+1:self.vlen[k]+1]
zy = ty[self.vlen[k-1]+1:self.vlen[k]+1]
p += 1
j = 0
while ( j < len(zx) ):
if (j == 0):
# Start Position.
path += 'M %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly])
elif (j <= len(zx)-3):
# Cubic Bezier Curve
path += 'C %d %d %d %d %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly], zx[j+1] * self.dpi / self.gdpi[gly], zy[j+1] * self.dpi / self.gdpi[gly], zx[j+2] * self.dpi / self.gdpi[gly], zy[j+2] * self.dpi / self.gdpi[gly])
j += 2
elif (j == len(zx)-2):
# Cubic Bezier Curve to Start Position
path += 'C %d %d %d %d %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly], zx[j+1] * self.dpi / self.gdpi[gly], zy[j+1] * self.dpi / self.gdpi[gly], zx[0] * self.dpi / self.gdpi[gly], zy[0] * self.dpi / self.gdpi[gly])
j += 1
elif (j == len(zx)-1):
# Quadratic Bezier Curve to Start Position
path += 'Q %d %d %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly], zx[0] * self.dpi / self.gdpi[gly], zy[0] * self.dpi / self.gdpi[gly])
j += 1
path += 'z'
return path
# dictionary of all text strings by index value
class GlyphDict(object):
def __init__(self):
self.gdict = {}
def lookup(self, id):
# id='id="gl%d"' % val
if id in self.gdict:
return self.gdict[id]
return None
def addGlyph(self, val, path):
id='id="gl%d"' % val
self.gdict[id] = path
def generateBook(bookDir, raw, fixedimage):
# sanity check Topaz file extraction
if not os.path.exists(bookDir) :
print "Can not find directory with unencrypted book"
return 1
dictFile = os.path.join(bookDir,'dict0000.dat')
if not os.path.exists(dictFile) :
print "Can not find dict0000.dat file"
return 1
pageDir = os.path.join(bookDir,'page')
if not os.path.exists(pageDir) :
print "Can not find page directory in unencrypted book"
return 1
imgDir = os.path.join(bookDir,'img')
if not os.path.exists(imgDir) :
print "Can not find image directory in unencrypted book"
return 1
glyphsDir = os.path.join(bookDir,'glyphs')
if not os.path.exists(glyphsDir) :
print "Can not find glyphs directory in unencrypted book"
return 1
metaFile = os.path.join(bookDir,'metadata0000.dat')
if not os.path.exists(metaFile) :
print "Can not find metadata0000.dat in unencrypted book"
return 1
svgDir = os.path.join(bookDir,'svg')
if not os.path.exists(svgDir) :
os.makedirs(svgDir)
xmlDir = os.path.join(bookDir,'xml')
if not os.path.exists(xmlDir) :
os.makedirs(xmlDir)
otherFile = os.path.join(bookDir,'other0000.dat')
if not os.path.exists(otherFile) :
print "Can not find other0000.dat in unencrypted book"
return 1
print "Updating to color images if available"
spath = os.path.join(bookDir,'color_img')
dpath = os.path.join(bookDir,'img')
filenames = os.listdir(spath)
filenames = sorted(filenames)
for filename in filenames:
imgname = filename.replace('color','img')
sfile = os.path.join(spath,filename)
dfile = os.path.join(dpath,imgname)
imgdata = file(sfile,'rb').read()
file(dfile,'wb').write(imgdata)
print "Creating cover.jpg"
isCover = False
cpath = os.path.join(bookDir,'img')
cpath = os.path.join(cpath,'img0000.jpg')
if os.path.isfile(cpath):
cover = file(cpath, 'rb').read()
cpath = os.path.join(bookDir,'cover.jpg')
file(cpath, 'wb').write(cover)
isCover = True
print 'Processing Dictionary'
dict = Dictionary(dictFile)
print 'Processing Meta Data and creating OPF'
meta_array = getMetaArray(metaFile)
xname = os.path.join(xmlDir, 'metadata.xml')
metastr = ''
for key in meta_array:
metastr += '<meta name="' + key + '" content="' + meta_array[key] + '" />\n'
file(xname, 'wb').write(metastr)
print 'Processing StyleSheet'
# get some scaling info from metadata to use while processing styles
fontsize = '135'
if 'fontSize' in meta_array:
fontsize = meta_array['fontSize']
# also get the size of a normal text page
spage = '1'
if 'firstTextPage' in meta_array:
spage = meta_array['firstTextPage']
pnum = int(spage)
# get page height and width from first text page for use in stylesheet scaling
pname = 'page%04d.dat' % (pnum + 1)
fname = os.path.join(pageDir,pname)
flat_xml = convert2xml.fromData(dict, fname)
(ph, pw) = getPageDim(flat_xml)
if (ph == '-1') or (ph == '0') : ph = '11000'
if (pw == '-1') or (pw == '0') : pw = '8500'
# print ' ', 'other0000.dat'
xname = os.path.join(bookDir, 'style.css')
flat_xml = convert2xml.fromData(dict, otherFile)
cssstr , classlst = stylexml2css.convert2CSS(flat_xml, fontsize, ph, pw)
file(xname, 'wb').write(cssstr)
xname = os.path.join(xmlDir, 'other0000.xml')
file(xname, 'wb').write(convert2xml.getXML(dict, otherFile))
print 'Processing Glyphs'
gd = GlyphDict()
filenames = os.listdir(glyphsDir)
filenames = sorted(filenames)
glyfname = os.path.join(svgDir,'glyphs.svg')
glyfile = open(glyfname, 'w')
glyfile.write('<?xml version="1.0" standalone="no"?>\n')
glyfile.write('<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n')
glyfile.write('<svg width="512" height="512" viewBox="0 0 511 511" xmlns="http://www.w3.org/2000/svg" version="1.1">\n')
glyfile.write('<title>Glyphs for %s</title>\n' % meta_array['Title'])
glyfile.write('<defs>\n')
counter = 0
for filename in filenames:
# print ' ', filename
print '.',
fname = os.path.join(glyphsDir,filename)
flat_xml = convert2xml.fromData(dict, fname)
xname = os.path.join(xmlDir, filename.replace('.dat','.xml'))
file(xname, 'wb').write(convert2xml.getXML(dict, fname))
gp = GParser(flat_xml)
for i in xrange(0, gp.count):
path = gp.getPath(i)
maxh, maxw = gp.getGlyphDim(i)
fullpath = '<path id="gl%d" d="%s" fill="black" /><!-- width=%d height=%d -->\n' % (counter * 256 + i, path, maxw, maxh)
glyfile.write(fullpath)
gd.addGlyph(counter * 256 + i, fullpath)
counter += 1
glyfile.write('</defs>\n')
glyfile.write('</svg>\n')
glyfile.close()
print " "
# start up the html
htmlFileName = "book.html"
htmlstr = '<?xml version="1.0" encoding="utf-8"?>\n'
htmlstr += '<!DOCTYPE HTML PUBLIC "-//W3C//DTD XHTML 1.1 Strict//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11-strict.dtd">\n'
htmlstr += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">\n'
htmlstr += '<head>\n'
htmlstr += '<meta http-equiv="content-type" content="text/html; charset=utf-8"/>\n'
htmlstr += '<title>' + meta_array['Title'] + ' by ' + meta_array['Authors'] + '</title>\n'
htmlstr += '<meta name="Author" content="' + meta_array['Authors'] + '" />\n'
htmlstr += '<meta name="Title" content="' + meta_array['Title'] + '" />\n'
htmlstr += '<meta name="ASIN" content="' + meta_array['ASIN'] + '" />\n'
htmlstr += '<meta name="GUID" content="' + meta_array['GUID'] + '" />\n'
htmlstr += '<link href="style.css" rel="stylesheet" type="text/css" />\n'
htmlstr += '</head>\n<body>\n'
print 'Processing Pages'
# Books are at 1440 DPI. This is rendering at twice that size for
# readability when rendering to the screen.
scaledpi = 1440.0
svgindex = '<?xml version="1.0" encoding="utf-8"?>\n'
svgindex += '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
svgindex += '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >'
svgindex += '<head>\n'
svgindex += '<title>' + meta_array['Title'] + '</title>\n'
svgindex += '<meta name="Author" content="' + meta_array['Authors'] + '" />\n'
svgindex += '<meta name="Title" content="' + meta_array['Title'] + '" />\n'
svgindex += '<meta name="ASIN" content="' + meta_array['ASIN'] + '" />\n'
svgindex += '<meta name="GUID" content="' + meta_array['GUID'] + '" />\n'
svgindex += '</head>\n'
svgindex += '<body>\n'
filenames = os.listdir(pageDir)
filenames = sorted(filenames)
numfiles = len(filenames)
counter = 0
for filename in filenames:
# print ' ', filename
print ".",
fname = os.path.join(pageDir,filename)
flat_xml = convert2xml.fromData(dict, fname)
xname = os.path.join(xmlDir, filename.replace('.dat','.xml'))
file(xname, 'wb').write(convert2xml.getXML(dict, fname))
# first get the html
htmlstr += flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir, gd, fixedimage)
# now get the svg image of the page
svgxml = flatxml2svg.convert2SVG(gd, flat_xml, counter, numfiles, svgDir, raw, meta_array, scaledpi)
if (raw) :
pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w')
svgindex += '<a href="svg/page%04d.svg">Page %d</a>\n' % (counter, counter)
else :
pfile = open(os.path.join(svgDir,'page%04d.xhtml' % counter), 'w')
svgindex += '<a href="svg/page%04d.xhtml">Page %d</a>\n' % (counter, counter)
pfile.write(svgxml)
pfile.close()
counter += 1
print " "
# finish up the html string and output it
htmlstr += '</body>\n</html>\n'
file(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)
# finish up the svg index string and output it
svgindex += '</body>\n</html>\n'
file(os.path.join(bookDir, 'index_svg.xhtml'), 'wb').write(svgindex)
# build the opf file
opfname = os.path.join(bookDir, 'book.opf')
opfstr = '<?xml version="1.0" encoding="utf-8"?>\n'
opfstr += '<package xmlns="http://www.idpf.org/2007/opf" unique-identifier="guid_id">\n'
# adding metadata
opfstr += ' <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">\n'
opfstr += ' <dc:identifier opf:scheme="GUID" id="guid_id">' + meta_array['GUID'] + '</dc:identifier>\n'
opfstr += ' <dc:identifier opf:scheme="ASIN">' + meta_array['ASIN'] + '</dc:identifier>\n'
opfstr += ' <dc:identifier opf:scheme="oASIN">' + meta_array['oASIN'] + '</dc:identifier>\n'
opfstr += ' <dc:title>' + meta_array['Title'] + '</dc:title>\n'
opfstr += ' <dc:creator opf:role="aut">' + meta_array['Authors'] + '</dc:creator>\n'
opfstr += ' <dc:language>en</dc:language>\n'
opfstr += ' <dc:date>' + meta_array['UpdateTime'] + '</dc:date>\n'
if isCover:
opfstr += ' <meta name="cover" content="bookcover"/>\n'
opfstr += ' </metadata>\n'
opfstr += '<manifest>\n'
opfstr += ' <item id="book" href="book.html" media-type="application/xhtml+xml"/>\n'
opfstr += ' <item id="stylesheet" href="style.css" media-type="text.css"/>\n'
# adding image files to manifest
filenames = os.listdir(imgDir)
filenames = sorted(filenames)
for filename in filenames:
imgname, imgext = os.path.splitext(filename)
if imgext == '.jpg':
imgext = 'jpeg'
if imgext == '.svg':
imgext = 'svg+xml'
opfstr += ' <item id="' + imgname + '" href="img/' + filename + '" media-type="image/' + imgext + '"/>\n'
if isCover:
opfstr += ' <item id="bookcover" href="cover.jpg" media-type="image/jpeg" />\n'
opfstr += '</manifest>\n'
# adding spine
opfstr += '<spine>\n <itemref idref="book" />\n</spine>\n'
if isCover:
opfstr += ' <guide>\n'
opfstr += ' <reference href="cover.jpg" type="cover" title="Cover"/>\n'
opfstr += ' </guide>\n'
opfstr += '</package>\n'
file(opfname, 'wb').write(opfstr)
print 'Processing Complete'
return 0
def usage():
print "genbook.py generates a book from the extract Topaz Files"
print "Usage:"
print " genbook.py [-r] [-h [--fixed-image] <bookDir> "
print " "
print "Options:"
print " -h : help - print this usage message"
print " -r : generate raw svg files (not wrapped in xhtml)"
print " --fixed-image : genearate any Fixed Area as an svg image in the html"
print " "
def main(argv):
bookDir = ''
if len(argv) == 0:
argv = sys.argv
try:
opts, args = getopt.getopt(argv[1:], "rh:",["fixed-image"])
except getopt.GetoptError, err:
print str(err)
usage()
return 1
if len(opts) == 0 and len(args) == 0 :
usage()
return 1
raw = 0
fixedimage = False
for o, a in opts:
if o =="-h":
usage()
return 0
if o =="-r":
raw = 1
if o =="--fixed-image":
fixedimage = True
bookDir = args[0]
rv = generateBook(bookDir, raw, fixedimage)
return rv
if __name__ == '__main__':
sys.exit(main(''))

View File

@@ -0,0 +1,145 @@
#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# For use with Topaz Scripts Version 2.6
class Unbuffered:
def __init__(self, stream):
self.stream = stream
def write(self, data):
self.stream.write(data)
self.stream.flush()
def __getattr__(self, attr):
return getattr(self.stream, attr)
import sys
sys.stdout=Unbuffered(sys.stdout)
import os, getopt
# local routines
import convert2xml
import flatxml2html
import decode_meta
def usage():
print 'Usage: '
print ' '
print ' genxml.py dict0000.dat unencryptedBookDir'
print ' '
def main(argv):
bookDir = ''
if len(argv) == 0:
argv = sys.argv
try:
opts, args = getopt.getopt(argv[1:], "h:")
except getopt.GetoptError, err:
print str(err)
usage()
sys.exit(1)
if len(opts) == 0 and len(args) == 0 :
usage()
sys.exit(1)
for o, a in opts:
if o =="-h":
usage()
sys.exit(0)
bookDir = args[0]
if not os.path.exists(bookDir) :
print "Can not find directory with unencrypted book"
sys.exit(1)
dictFile = os.path.join(bookDir,'dict0000.dat')
if not os.path.exists(dictFile) :
print "Can not find dict0000.dat file"
sys.exit(1)
pageDir = os.path.join(bookDir,'page')
if not os.path.exists(pageDir) :
print "Can not find page directory in unencrypted book"
sys.exit(1)
glyphsDir = os.path.join(bookDir,'glyphs')
if not os.path.exists(glyphsDir) :
print "Can not find glyphs directory in unencrypted book"
sys.exit(1)
otherFile = os.path.join(bookDir,'other0000.dat')
if not os.path.exists(otherFile) :
print "Can not find other0000.dat in unencrypted book"
sys.exit(1)
metaFile = os.path.join(bookDir,'metadata0000.dat')
if not os.path.exists(metaFile) :
print "Can not find metadata0000.dat in unencrypted book"
sys.exit(1)
xmlDir = os.path.join(bookDir,'xml')
if not os.path.exists(xmlDir):
os.makedirs(xmlDir)
print 'Processing ... '
print ' ', 'metadata0000.dat'
fname = os.path.join(bookDir,'metadata0000.dat')
xname = os.path.join(xmlDir, 'metadata.txt')
metastr = decode_meta.getMetaData(fname)
file(xname, 'wb').write(metastr)
print ' ', 'other0000.dat'
fname = os.path.join(bookDir,'other0000.dat')
xname = os.path.join(xmlDir, 'stylesheet.xml')
pargv=[]
pargv.append('convert2xml.py')
pargv.append(dictFile)
pargv.append(fname)
xmlstr = convert2xml.main(pargv)
file(xname, 'wb').write(xmlstr)
filenames = os.listdir(pageDir)
filenames = sorted(filenames)
for filename in filenames:
print ' ', filename
fname = os.path.join(pageDir,filename)
xname = os.path.join(xmlDir, filename.replace('.dat','.xml'))
pargv=[]
pargv.append('convert2xml.py')
pargv.append(dictFile)
pargv.append(fname)
xmlstr = convert2xml.main(pargv)
file(xname, 'wb').write(xmlstr)
filenames = os.listdir(glyphsDir)
filenames = sorted(filenames)
for filename in filenames:
print ' ', filename
fname = os.path.join(glyphsDir,filename)
xname = os.path.join(xmlDir, filename.replace('.dat','.xml'))
pargv=[]
pargv.append('convert2xml.py')
pargv.append(dictFile)
pargv.append(fname)
xmlstr = convert2xml.main(pargv)
file(xname, 'wb').write(xmlstr)
print 'Processing Complete'
return 0
if __name__ == '__main__':
sys.exit(main(''))

View File

@@ -0,0 +1,333 @@
# engine to remove drm from Kindle for Mac books
# for personal use for archiving and converting your ebooks
# PLEASE DO NOT PIRATE!
# We want all authors and Publishers, and eBook stores to live long and prosperous lives
#
# it borrows heavily from works by CMBDTC, IHeartCabbages, skindle,
# unswindle, DiapDealer, some_updates and many many others
from __future__ import with_statement
class Unbuffered:
def __init__(self, stream):
self.stream = stream
def write(self, data):
self.stream.write(data)
self.stream.flush()
def __getattr__(self, attr):
return getattr(self.stream, attr)
import sys
sys.stdout=Unbuffered(sys.stdout)
import os, csv, getopt
from struct import pack
from struct import unpack
import zlib
# for handling sub processes
import subprocess
from subprocess import Popen, PIPE, STDOUT
import subasyncio
from subasyncio import Process
#Exception Handling
class K4MDEDRMError(Exception):
pass
class K4MDEDRMFatal(Exception):
pass
#
# crypto routines
#
import hashlib
def MD5(message):
ctx = hashlib.md5()
ctx.update(message)
return ctx.digest()
def SHA1(message):
ctx = hashlib.sha1()
ctx.update(message)
return ctx.digest()
def SHA256(message):
ctx = hashlib.sha256()
ctx.update(message)
return ctx.digest()
# interface to needed routines in openssl's libcrypto
def _load_crypto_libcrypto():
from ctypes import CDLL, byref, POINTER, c_void_p, c_char_p, c_int, c_long, \
Structure, c_ulong, create_string_buffer, addressof, string_at, cast
from ctypes.util import find_library
libcrypto = find_library('crypto')
if libcrypto is None:
raise K4MDEDRMError('libcrypto not found')
libcrypto = CDLL(libcrypto)
AES_MAXNR = 14
c_char_pp = POINTER(c_char_p)
c_int_p = POINTER(c_int)
class AES_KEY(Structure):
_fields_ = [('rd_key', c_long * (4 * (AES_MAXNR + 1))), ('rounds', c_int)]
AES_KEY_p = POINTER(AES_KEY)
def F(restype, name, argtypes):
func = getattr(libcrypto, name)
func.restype = restype
func.argtypes = argtypes
return func
AES_cbc_encrypt = F(None, 'AES_cbc_encrypt',[c_char_p, c_char_p, c_ulong, AES_KEY_p, c_char_p,c_int])
AES_set_decrypt_key = F(c_int, 'AES_set_decrypt_key',[c_char_p, c_int, AES_KEY_p])
PKCS5_PBKDF2_HMAC_SHA1 = F(c_int, 'PKCS5_PBKDF2_HMAC_SHA1',
[c_char_p, c_ulong, c_char_p, c_ulong, c_ulong, c_ulong, c_char_p])
class LibCrypto(object):
def __init__(self):
self._blocksize = 0
self._keyctx = None
self.iv = 0
def set_decrypt_key(self, userkey, iv):
self._blocksize = len(userkey)
if (self._blocksize != 16) and (self._blocksize != 24) and (self._blocksize != 32) :
raise K4MDEDRMError('AES improper key used')
return
keyctx = self._keyctx = AES_KEY()
self.iv = iv
rv = AES_set_decrypt_key(userkey, len(userkey) * 8, keyctx)
if rv < 0:
raise K4MDEDRMError('Failed to initialize AES key')
def decrypt(self, data):
out = create_string_buffer(len(data))
rv = AES_cbc_encrypt(data, out, len(data), self._keyctx, self.iv, 0)
if rv == 0:
raise K4MDEDRMError('AES decryption failed')
return out.raw
def keyivgen(self, passwd):
salt = '16743'
saltlen = 5
passlen = len(passwd)
iter = 0x3e8
keylen = 80
out = create_string_buffer(keylen)
rv = PKCS5_PBKDF2_HMAC_SHA1(passwd, passlen, salt, saltlen, iter, keylen, out)
return out.raw
return LibCrypto
def _load_crypto():
LibCrypto = None
try:
LibCrypto = _load_crypto_libcrypto()
except (ImportError, K4MDEDRMError):
pass
return LibCrypto
LibCrypto = _load_crypto()
#
# Utility Routines
#
# uses a sub process to get the Hard Drive Serial Number using ioreg
# returns with the first found serial number in that class
def GetVolumeSerialNumber():
sernum = os.getenv('MYSERIALNUMBER')
if sernum != None:
return sernum
cmdline = '/usr/sbin/ioreg -l -S -w 0 -r -c AppleAHCIDiskDriver'
cmdline = cmdline.encode(sys.getfilesystemencoding())
p = Process(cmdline, shell=True, bufsize=1, stdin=None, stdout=PIPE, stderr=PIPE, close_fds=False)
poll = p.wait('wait')
results = p.read()
reslst = results.split('\n')
cnt = len(reslst)
bsdname = None
sernum = None
foundIt = False
for j in xrange(cnt):
resline = reslst[j]
pp = resline.find('"Serial Number" = "')
if pp >= 0:
sernum = resline[pp+19:-1]
sernum = sernum.strip()
bb = resline.find('"BSD Name" = "')
if bb >= 0:
bsdname = resline[bb+14:-1]
bsdname = bsdname.strip()
if (bsdname == 'disk0') and (sernum != None):
foundIt = True
break
if not foundIt:
sernum = '9999999999'
return sernum
# uses unix env to get username instead of using sysctlbyname
def GetUserName():
username = os.getenv('USER')
return username
MAX_PATH = 255
#
# start of Kindle specific routines
#
global kindleDatabase
# Various character maps used to decrypt books. Probably supposed to act as obfuscation
charMap1 = "n5Pr6St7Uv8Wx9YzAb0Cd1Ef2Gh3Jk4M"
charMap2 = "ZB0bYyc1xDdW2wEV3Ff7KkPpL8UuGA4gz-Tme9Nn_tHh5SvXCsIiR6rJjQaqlOoM"
charMap3 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
charMap4 = "ABCDEFGHIJKLMNPQRSTUVWXYZ123456789"
# Encode the bytes in data with the characters in map
def encode(data, map):
result = ""
for char in data:
value = ord(char)
Q = (value ^ 0x80) // len(map)
R = value % len(map)
result += map[Q]
result += map[R]
return result
# Hash the bytes in data and then encode the digest with the characters in map
def encodeHash(data,map):
return encode(MD5(data),map)
# Decode the string in data with the characters in map. Returns the decoded bytes
def decode(data,map):
result = ""
for i in range (0,len(data)-1,2):
high = map.find(data[i])
low = map.find(data[i+1])
if (high == -1) or (low == -1) :
break
value = (((high * len(map)) ^ 0x80) & 0xFF) + low
result += pack("B",value)
return result
# implements an Pseudo Mac Version of Windows built-in Crypto routine
def CryptUnprotectData(encryptedData):
sp = GetVolumeSerialNumber() + '!@#' + GetUserName()
passwdData = encode(SHA256(sp),charMap1)
crp = LibCrypto()
key_iv = crp.keyivgen(passwdData)
key = key_iv[0:32]
iv = key_iv[32:48]
crp.set_decrypt_key(key,iv)
cleartext = crp.decrypt(encryptedData)
return cleartext
# Locate and open the .kindle-info file
def openKindleInfo():
home = os.getenv('HOME')
kinfopath = home + '/Library/Application Support/Amazon/Kindle/storage/.kindle-info'
if not os.path.exists(kinfopath):
kinfopath = home + '/Library/Application Support/Amazon/Kindle for Mac/storage/.kindle-info'
if not os.path.exists(kinfopath):
raise K4MDEDRMError('Error: .kindle-info file can not be found')
return open(kinfopath,'r')
# Parse the Kindle.info file and return the records as a list of key-values
def parseKindleInfo():
DB = {}
infoReader = openKindleInfo()
infoReader.read(1)
data = infoReader.read()
items = data.split('[')
for item in items:
splito = item.split(':')
DB[splito[0]] =splito[1]
return DB
# Get a record from the Kindle.info file for the key "hashedKey" (already hashed and encoded). Return the decoded and decrypted record
def getKindleInfoValueForHash(hashedKey):
global kindleDatabase
encryptedValue = decode(kindleDatabase[hashedKey],charMap2)
cleartext = CryptUnprotectData(encryptedValue)
return decode(cleartext, charMap1)
# Get a record from the Kindle.info file for the string in "key" (plaintext). Return the decoded and decrypted record
def getKindleInfoValueForKey(key):
return getKindleInfoValueForHash(encodeHash(key,charMap2))
# Find if the original string for a hashed/encoded string is known. If so return the original string othwise return an empty string.
def findNameForHash(hash):
names = ["kindle.account.tokens","kindle.cookie.item","eulaVersionAccepted","login_date","kindle.token.item","login","kindle.key.item","kindle.name.info","kindle.device.info", "MazamaRandomNumber"]
result = ""
for name in names:
if hash == encodeHash(name, charMap2):
result = name
break
return result
# Print all the records from the kindle.info file (option -i)
def printKindleInfo():
for record in kindleDatabase:
name = findNameForHash(record)
if name != "" :
print (name)
print ("--------------------------")
else :
print ("Unknown Record")
print getKindleInfoValueForHash(record)
print "\n"
#
# PID generation routines
#
# Returns two bit at offset from a bit field
def getTwoBitsFromBitField(bitField,offset):
byteNumber = offset // 4
bitPosition = 6 - 2*(offset % 4)
return ord(bitField[byteNumber]) >> bitPosition & 3
# Returns the six bits at offset from a bit field
def getSixBitsFromBitField(bitField,offset):
offset *= 3
value = (getTwoBitsFromBitField(bitField,offset) <<4) + (getTwoBitsFromBitField(bitField,offset+1) << 2) +getTwoBitsFromBitField(bitField,offset+2)
return value
# 8 bits to six bits encoding from hash to generate PID string
def encodePID(hash):
global charMap3
PID = ""
for position in range (0,8):
PID += charMap3[getSixBitsFromBitField(hash,position)]
return PID
#
# Main
#
def main(argv=sys.argv):
global kindleDatabase
kindleDatabase = None
#
# Read the encrypted database
#
try:
kindleDatabase = parseKindleInfo()
except Exception, message:
print(message)
if kindleDatabase != None :
printKindleInfo()
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@@ -0,0 +1,333 @@
#!/usr/bin/env python
# engine to remove drm from Kindle for Mac and Kindle for PC books
# for personal use for archiving and converting your ebooks
# PLEASE DO NOT PIRATE EBOOKS!
# We want all authors and publishers, and eBook stores to live
# long and prosperous lives but at the same time we just want to
# be able to read OUR books on whatever device we want and to keep
# readable for a long, long time
# This borrows very heavily from works by CMBDTC, IHeartCabbages, skindle,
# unswindle, DarkReverser, ApprenticeAlf, DiapDealer, some_updates
# and many many others
# It can run standalone to convert K4M/K4PC/Mobi files, or it can be installed as a
# plugin for Calibre (http://calibre-ebook.com/about) so that importing
# K4 or Mobi with DRM is no londer a multi-step process.
#
# ***NOTE*** If you are using this script as a calibre plugin for a K4M or K4PC ebook
# then calibre must be installed on the same machine and in the same account as K4PC or K4M
# for the plugin version to function properly.
#
# To create a Calibre plugin, rename this file so that the filename
# ends in '_plugin.py', put it into a ZIP file with all its supporting python routines
# and import that ZIP into Calibre using its plugin configuration GUI.
from __future__ import with_statement
__version__ = '1.4'
class Unbuffered:
def __init__(self, stream):
self.stream = stream
def write(self, data):
self.stream.write(data)
self.stream.flush()
def __getattr__(self, attr):
return getattr(self.stream, attr)
import sys
import os, csv, getopt
import binascii
import zlib
import re
import zlib, zipfile, tempfile, shutil
from struct import pack, unpack, unpack_from
class DrmException(Exception):
pass
if 'calibre' in sys.modules:
inCalibre = True
else:
inCalibre = False
def zipUpDir(myzip, tempdir,localname):
currentdir = tempdir
if localname != "":
currentdir = os.path.join(currentdir,localname)
list = os.listdir(currentdir)
for file in list:
afilename = file
localfilePath = os.path.join(localname, afilename)
realfilePath = os.path.join(currentdir,file)
if os.path.isfile(realfilePath):
myzip.write(realfilePath, localfilePath)
elif os.path.isdir(realfilePath):
zipUpDir(myzip, tempdir, localfilePath)
def usage(progname):
print "Removes DRM protection from K4PC/M, Kindle, Mobi and Topaz ebooks"
print "Usage:"
print " %s [-k <kindle.info>] [-p <pidnums>] [-s <kindleSerialNumbers>] <infile> <outdir> " % progname
#
# Main
#
def main(argv=sys.argv):
import mobidedrm
import topazextract
import kgenpids
progname = os.path.basename(argv[0])
k4 = False
kInfoFiles = []
serials = []
pids = []
print ('K4MobiDeDrm v%(__version__)s '
'provided by the work of many including DiapDealer, SomeUpdates, IHeartCabbages, CMBDTC, Skindle, DarkReverser, ApprenticeAlf, etc .' % globals())
print ' '
try:
opts, args = getopt.getopt(sys.argv[1:], "k:p:s:")
except getopt.GetoptError, err:
print str(err)
usage(progname)
sys.exit(2)
if len(args)<2:
usage(progname)
sys.exit(2)
for o, a in opts:
if o == "-k":
if a == None :
raise DrmException("Invalid parameter for -k")
kInfoFiles.append(a)
if o == "-p":
if a == None :
raise DrmException("Invalid parameter for -p")
pids = a.split(',')
if o == "-s":
if a == None :
raise DrmException("Invalid parameter for -s")
serials = a.split(',')
# try with built in Kindle Info files
k4 = True
infile = args[0]
outdir = args[1]
# handle the obvious cases at the beginning
if not os.path.isfile(infile):
print "Error: Input file does not exist"
return 1
mobi = True
magic3 = file(infile,'rb').read(3)
if magic3 == 'TPZ':
mobi = False
bookname = os.path.splitext(os.path.basename(infile))[0]
if mobi:
mb = mobidedrm.MobiBook(infile)
else:
tempdir = tempfile.mkdtemp()
mb = topazextract.TopazBook(infile, tempdir)
title = mb.getBookTitle()
print "Processing Book: ", title
# build pid list
md1, md2 = mb.getPIDMetaInfo()
pidlst = kgenpids.getPidList(md1, md2, k4, pids, serials, kInfoFiles)
try:
if mobi:
unlocked_file = mb.processBook(pidlst)
else:
mb.processBook(pidlst)
except mobidedrm.DrmException, e:
print " ... not suceessful " + str(e) + "\n"
return 1
except topazextract.TpzDRMError, e:
print str(e)
print " Creating DeBug Full Zip Archive of Book"
zipname = os.path.join(outdir, bookname + '_debug' + '.zip')
myzip = zipfile.ZipFile(zipname,'w',zipfile.ZIP_DEFLATED, False)
zipUpDir(myzip, tempdir, '')
myzip.close()
return 1
if mobi:
outfile = os.path.join(outdir,bookname + '_nodrm' + '.azw')
file(outfile, 'wb').write(unlocked_file)
return 0
# topaz: build up zip archives of results
print " Creating HTML ZIP Archive"
zipname = os.path.join(outdir, bookname + '_nodrm' + '.zip')
myzip1 = zipfile.ZipFile(zipname,'w',zipfile.ZIP_DEFLATED, False)
myzip1.write(os.path.join(tempdir,'book.html'),'book.html')
myzip1.write(os.path.join(tempdir,'book.opf'),'book.opf')
if os.path.isfile(os.path.join(tempdir,'cover.jpg')):
myzip1.write(os.path.join(tempdir,'cover.jpg'),'cover.jpg')
myzip1.write(os.path.join(tempdir,'style.css'),'style.css')
zipUpDir(myzip1, tempdir, 'img')
myzip1.close()
print " Creating SVG ZIP Archive"
zipname = os.path.join(outdir, bookname + '_SVG' + '.zip')
myzip2 = zipfile.ZipFile(zipname,'w',zipfile.ZIP_DEFLATED, False)
myzip2.write(os.path.join(tempdir,'index_svg.xhtml'),'index_svg.xhtml')
zipUpDir(myzip2, tempdir, 'svg')
zipUpDir(myzip2, tempdir, 'img')
myzip2.close()
print " Creating XML ZIP Archive"
zipname = os.path.join(outdir, bookname + '_XML' + '.zip')
myzip3 = zipfile.ZipFile(zipname,'w',zipfile.ZIP_DEFLATED, False)
targetdir = os.path.join(tempdir,'xml')
zipUpDir(myzip3, targetdir, '')
zipUpDir(myzip3, tempdir, 'img')
myzip3.close()
shutil.rmtree(tempdir)
return 0
if __name__ == '__main__':
sys.stdout=Unbuffered(sys.stdout)
sys.exit(main())
if not __name__ == "__main__" and inCalibre:
from calibre.customize import FileTypePlugin
class K4DeDRM(FileTypePlugin):
name = 'K4PC, K4Mac, Kindle Mobi and Topaz DeDRM' # Name of the plugin
description = 'Removes DRM from K4PC and Mac, Kindle Mobi and Topaz files. \
Provided by the work of many including DiapDealer, SomeUpdates, IHeartCabbages, CMBDTC, Skindle, DarkReverser, ApprenticeAlf, etc.'
supported_platforms = ['osx', 'windows', 'linux'] # Platforms this plugin will run on
author = 'DiapDealer, SomeUpdates' # The author of this plugin
version = (0, 1, 7) # The version number of this plugin
file_types = set(['prc','mobi','azw','azw1','tpz']) # The file types that this plugin will be applied to
on_import = True # Run this plugin during the import
priority = 210 # run this plugin before mobidedrm, k4pcdedrm, k4dedrm
def run(self, path_to_ebook):
from calibre.gui2 import is_ok_to_use_qt
from PyQt4.Qt import QMessageBox
from calibre.ptempfile import PersistentTemporaryDirectory
import kgenpids
import zlib
import zipfile
import topazextract
import mobidedrm
k4 = True
pids = []
serials = []
kInfoFiles = []
# Get supplied list of PIDs to try from plugin customization.
customvalues = self.site_customization.split(',')
for customvalue in customvalues:
customvalue = str(customvalue)
customvalue = customvalue.strip()
if len(customvalue) == 10 or len(customvalue) == 8:
pids.append(customvalue)
else :
if len(customvalue) == 16 and customvalue[0] == 'B':
serials.append(customvalue)
else:
print "%s is not a valid Kindle serial number or PID." % str(customvalue)
# Load any kindle info files (*.info) included Calibre's config directory.
try:
# Find Calibre's configuration directory.
confpath = os.path.split(os.path.split(self.plugin_path)[0])[0]
print 'K4MobiDeDRM: Calibre configuration directory = %s' % confpath
files = os.listdir(confpath)
filefilter = re.compile("\.info$", re.IGNORECASE)
files = filter(filefilter.search, files)
if files:
for filename in files:
fpath = os.path.join(confpath, filename)
kInfoFiles.append(fpath)
print 'K4MobiDeDRM: Kindle info file %s found in config folder.' % filename
except IOError:
print 'K4MobiDeDRM: Error reading kindle info files from config directory.'
pass
mobi = True
magic3 = file(path_to_ebook,'rb').read(3)
if magic3 == 'TPZ':
mobi = False
bookname = os.path.splitext(os.path.basename(path_to_ebook))[0]
if mobi:
mb = mobidedrm.MobiBook(path_to_ebook)
else:
tempdir = PersistentTemporaryDirectory()
mb = topazextract.TopazBook(path_to_ebook, tempdir)
title = mb.getBookTitle()
md1, md2 = mb.getPIDMetaInfo()
pidlst = kgenpids.getPidList(md1, md2, k4, pids, serials, kInfoFiles)
try:
if mobi:
unlocked_file = mb.processBook(pidlst)
else:
mb.processBook(pidlst)
except mobidedrm.DrmException:
#if you reached here then no luck raise and exception
if is_ok_to_use_qt():
d = QMessageBox(QMessageBox.Warning, "K4MobiDeDRM Plugin", "Error decoding: %s\n" % path_to_ebook)
d.show()
d.raise_()
d.exec_()
raise Exception("K4MobiDeDRM plugin could not decode the file")
return ""
except topazextract.TpzDRMError:
#if you reached here then no luck raise and exception
if is_ok_to_use_qt():
d = QMessageBox(QMessageBox.Warning, "K4MobiDeDRM Plugin", "Error decoding: %s\n" % path_to_ebook)
d.show()
d.raise_()
d.exec_()
raise Exception("K4MobiDeDRM plugin could not decode the file")
return ""
print "Success!"
if mobi:
of = self.temporary_file(bookname+'.mobi')
of.write(unlocked_file)
of.close()
return of.name
# topaz: build up zip archives of results
print " Creating HTML ZIP Archive"
of = self.temporary_file(bookname + '.zip')
myzip = zipfile.ZipFile(of.name,'w',zipfile.ZIP_DEFLATED, False)
myzip.write(os.path.join(tempdir,'book.html'),'book.html')
myzip.write(os.path.join(tempdir,'book.opf'),'book.opf')
if os.path.isfile(os.path.join(tempdir,'cover.jpg')):
myzip.write(os.path.join(tempdir,'cover.jpg'),'cover.jpg')
myzip.write(os.path.join(tempdir,'style.css'),'style.css')
zipUpDir(myzip, tempdir, 'img')
myzip.close()
return of.name
def customization_help(self, gui=False):
return 'Enter 10 character PIDs and/or Kindle serial numbers, separated by commas.'

View File

@@ -0,0 +1,192 @@
# standlone set of Mac OSX specific routines needed for K4DeDRM
from __future__ import with_statement
import sys
import os
import subprocess
class K4MDrmException(Exception):
pass
# interface to needed routines in openssl's libcrypto
def _load_crypto_libcrypto():
from ctypes import CDLL, byref, POINTER, c_void_p, c_char_p, c_int, c_long, \
Structure, c_ulong, create_string_buffer, addressof, string_at, cast
from ctypes.util import find_library
libcrypto = find_library('crypto')
if libcrypto is None:
raise K4MDrmException('libcrypto not found')
libcrypto = CDLL(libcrypto)
AES_MAXNR = 14
c_char_pp = POINTER(c_char_p)
c_int_p = POINTER(c_int)
class AES_KEY(Structure):
_fields_ = [('rd_key', c_long * (4 * (AES_MAXNR + 1))), ('rounds', c_int)]
AES_KEY_p = POINTER(AES_KEY)
def F(restype, name, argtypes):
func = getattr(libcrypto, name)
func.restype = restype
func.argtypes = argtypes
return func
AES_cbc_encrypt = F(None, 'AES_cbc_encrypt',[c_char_p, c_char_p, c_ulong, AES_KEY_p, c_char_p,c_int])
AES_set_decrypt_key = F(c_int, 'AES_set_decrypt_key',[c_char_p, c_int, AES_KEY_p])
PKCS5_PBKDF2_HMAC_SHA1 = F(c_int, 'PKCS5_PBKDF2_HMAC_SHA1',
[c_char_p, c_ulong, c_char_p, c_ulong, c_ulong, c_ulong, c_char_p])
class LibCrypto(object):
def __init__(self):
self._blocksize = 0
self._keyctx = None
self.iv = 0
def set_decrypt_key(self, userkey, iv):
self._blocksize = len(userkey)
if (self._blocksize != 16) and (self._blocksize != 24) and (self._blocksize != 32) :
raise K4MDrmException('AES improper key used')
return
keyctx = self._keyctx = AES_KEY()
self.iv = iv
rv = AES_set_decrypt_key(userkey, len(userkey) * 8, keyctx)
if rv < 0:
raise K4MDrmException('Failed to initialize AES key')
def decrypt(self, data):
out = create_string_buffer(len(data))
rv = AES_cbc_encrypt(data, out, len(data), self._keyctx, self.iv, 0)
if rv == 0:
raise K4MDrmException('AES decryption failed')
return out.raw
def keyivgen(self, passwd):
salt = '16743'
saltlen = 5
passlen = len(passwd)
iter = 0x3e8
keylen = 80
out = create_string_buffer(keylen)
rv = PKCS5_PBKDF2_HMAC_SHA1(passwd, passlen, salt, saltlen, iter, keylen, out)
return out.raw
return LibCrypto
def _load_crypto():
LibCrypto = None
try:
LibCrypto = _load_crypto_libcrypto()
except (ImportError, K4MDrmException):
pass
return LibCrypto
LibCrypto = _load_crypto()
#
# Utility Routines
#
# Various character maps used to decrypt books. Probably supposed to act as obfuscation
charMap1 = "n5Pr6St7Uv8Wx9YzAb0Cd1Ef2Gh3Jk4M"
charMap2 = "ZB0bYyc1xDdW2wEV3Ff7KkPpL8UuGA4gz-Tme9Nn_tHh5SvXCsIiR6rJjQaqlOoM"
charMap3 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
charMap4 = "ABCDEFGHIJKLMNPQRSTUVWXYZ123456789"
# uses a sub process to get the Hard Drive Serial Number using ioreg
# returns with the serial number of drive whose BSD Name is "disk0"
def GetVolumeSerialNumber():
sernum = os.getenv('MYSERIALNUMBER')
if sernum != None:
return sernum
cmdline = '/usr/sbin/ioreg -l -S -w 0 -r -c AppleAHCIDiskDriver'
cmdline = cmdline.encode(sys.getfilesystemencoding())
p = subprocess.Popen(cmdline, shell=True, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=False)
out1, out2 = p.communicate()
reslst = out1.split('\n')
cnt = len(reslst)
bsdname = None
sernum = None
foundIt = False
for j in xrange(cnt):
resline = reslst[j]
pp = resline.find('"Serial Number" = "')
if pp >= 0:
sernum = resline[pp+19:-1]
sernum = sernum.strip()
bb = resline.find('"BSD Name" = "')
if bb >= 0:
bsdname = resline[bb+14:-1]
bsdname = bsdname.strip()
if (bsdname == 'disk0') and (sernum != None):
foundIt = True
break
if not foundIt:
sernum = '9999999999'
return sernum
# uses unix env to get username instead of using sysctlbyname
def GetUserName():
username = os.getenv('USER')
return username
def encode(data, map):
result = ""
for char in data:
value = ord(char)
Q = (value ^ 0x80) // len(map)
R = value % len(map)
result += map[Q]
result += map[R]
return result
import hashlib
def SHA256(message):
ctx = hashlib.sha256()
ctx.update(message)
return ctx.digest()
# implements an Pseudo Mac Version of Windows built-in Crypto routine
def CryptUnprotectData(encryptedData):
sp = GetVolumeSerialNumber() + '!@#' + GetUserName()
passwdData = encode(SHA256(sp),charMap1)
crp = LibCrypto()
key_iv = crp.keyivgen(passwdData)
key = key_iv[0:32]
iv = key_iv[32:48]
crp.set_decrypt_key(key,iv)
cleartext = crp.decrypt(encryptedData)
return cleartext
# Locate and open the .kindle-info file
def openKindleInfo(kInfoFile=None):
if kInfoFile == None:
home = os.getenv('HOME')
cmdline = 'find "' + home + '/Library/Application Support" -name ".kindle-info"'
cmdline = cmdline.encode(sys.getfilesystemencoding())
p1 = subprocess.Popen(cmdline, shell=True, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=False)
out1, out2 = p1.communicate()
reslst = out1.split('\n')
kinfopath = 'NONE'
cnt = len(reslst)
for j in xrange(cnt):
resline = reslst[j]
pp = resline.find('.kindle-info')
if pp >= 0:
kinfopath = resline
break
if not os.path.exists(kinfopath):
raise K4MDrmException('Error: .kindle-info file can not be found')
return open(kinfopath,'r')
else:
return open(kInfoFile, 'r')

View File

@@ -0,0 +1,105 @@
# K4PC Windows specific routines
from __future__ import with_statement
import sys, os
from ctypes import windll, c_char_p, c_wchar_p, c_uint, POINTER, byref, \
create_unicode_buffer, create_string_buffer, CFUNCTYPE, addressof, \
string_at, Structure, c_void_p, cast
import _winreg as winreg
import traceback
MAX_PATH = 255
kernel32 = windll.kernel32
advapi32 = windll.advapi32
crypt32 = windll.crypt32
# Various character maps used to decrypt books. Probably supposed to act as obfuscation
charMap1 = "n5Pr6St7Uv8Wx9YzAb0Cd1Ef2Gh3Jk4M"
charMap2 = "AaZzB0bYyCc1XxDdW2wEeVv3FfUuG4g-TtHh5SsIiR6rJjQq7KkPpL8lOoMm9Nn_"
charMap3 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
charMap4 = "ABCDEFGHIJKLMNPQRSTUVWXYZ123456789"
class DrmException(Exception):
pass
class DataBlob(Structure):
_fields_ = [('cbData', c_uint),
('pbData', c_void_p)]
DataBlob_p = POINTER(DataBlob)
def GetSystemDirectory():
GetSystemDirectoryW = kernel32.GetSystemDirectoryW
GetSystemDirectoryW.argtypes = [c_wchar_p, c_uint]
GetSystemDirectoryW.restype = c_uint
def GetSystemDirectory():
buffer = create_unicode_buffer(MAX_PATH + 1)
GetSystemDirectoryW(buffer, len(buffer))
return buffer.value
return GetSystemDirectory
GetSystemDirectory = GetSystemDirectory()
def GetVolumeSerialNumber():
GetVolumeInformationW = kernel32.GetVolumeInformationW
GetVolumeInformationW.argtypes = [c_wchar_p, c_wchar_p, c_uint,
POINTER(c_uint), POINTER(c_uint),
POINTER(c_uint), c_wchar_p, c_uint]
GetVolumeInformationW.restype = c_uint
def GetVolumeSerialNumber(path = GetSystemDirectory().split('\\')[0] + '\\'):
vsn = c_uint(0)
GetVolumeInformationW(path, None, 0, byref(vsn), None, None, None, 0)
return str(vsn.value)
return GetVolumeSerialNumber
GetVolumeSerialNumber = GetVolumeSerialNumber()
def GetUserName():
GetUserNameW = advapi32.GetUserNameW
GetUserNameW.argtypes = [c_wchar_p, POINTER(c_uint)]
GetUserNameW.restype = c_uint
def GetUserName():
buffer = create_unicode_buffer(32)
size = c_uint(len(buffer))
while not GetUserNameW(buffer, byref(size)):
buffer = create_unicode_buffer(len(buffer) * 2)
size.value = len(buffer)
return buffer.value.encode('utf-16-le')[::2]
return GetUserName
GetUserName = GetUserName()
def CryptUnprotectData():
_CryptUnprotectData = crypt32.CryptUnprotectData
_CryptUnprotectData.argtypes = [DataBlob_p, c_wchar_p, DataBlob_p,
c_void_p, c_void_p, c_uint, DataBlob_p]
_CryptUnprotectData.restype = c_uint
def CryptUnprotectData(indata, entropy):
indatab = create_string_buffer(indata)
indata = DataBlob(len(indata), cast(indatab, c_void_p))
entropyb = create_string_buffer(entropy)
entropy = DataBlob(len(entropy), cast(entropyb, c_void_p))
outdata = DataBlob()
if not _CryptUnprotectData(byref(indata), None, byref(entropy),
None, None, 0, byref(outdata)):
raise DrmException("Failed to Unprotect Data")
return string_at(outdata.pbData, outdata.cbData)
return CryptUnprotectData
CryptUnprotectData = CryptUnprotectData()
#
# Locate and open the Kindle.info file.
#
def openKindleInfo(kInfoFile=None):
if kInfoFile == None:
regkey = winreg.OpenKey(winreg.HKEY_CURRENT_USER, "Software\\Microsoft\\Windows\\CurrentVersion\\Explorer\\Shell Folders\\")
path = winreg.QueryValueEx(regkey, 'Local AppData')[0]
return open(path+'\\Amazon\\Kindle For PC\\{AMAwzsaPaaZAzmZzZQzgZCAkZ3AjA_AY}\\kindle.info','r')
else:
return open(kInfoFile, 'r')

View File

@@ -0,0 +1,312 @@
#!/usr/bin/env python
from __future__ import with_statement
import sys
import os, csv
import binascii
import zlib
import re
from struct import pack, unpack, unpack_from
class DrmException(Exception):
pass
global kindleDatabase
global charMap1
global charMap2
global charMap3
global charMap4
if sys.platform.startswith('win'):
from k4pcutils import openKindleInfo, CryptUnprotectData, GetUserName, GetVolumeSerialNumber, charMap2
if sys.platform.startswith('darwin'):
from k4mutils import openKindleInfo, CryptUnprotectData, GetUserName, GetVolumeSerialNumber, charMap2
charMap1 = "n5Pr6St7Uv8Wx9YzAb0Cd1Ef2Gh3Jk4M"
charMap3 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
charMap4 = "ABCDEFGHIJKLMNPQRSTUVWXYZ123456789"
# crypto digestroutines
import hashlib
def MD5(message):
ctx = hashlib.md5()
ctx.update(message)
return ctx.digest()
def SHA1(message):
ctx = hashlib.sha1()
ctx.update(message)
return ctx.digest()
# Encode the bytes in data with the characters in map
def encode(data, map):
result = ""
for char in data:
value = ord(char)
Q = (value ^ 0x80) // len(map)
R = value % len(map)
result += map[Q]
result += map[R]
return result
# Hash the bytes in data and then encode the digest with the characters in map
def encodeHash(data,map):
return encode(MD5(data),map)
# Decode the string in data with the characters in map. Returns the decoded bytes
def decode(data,map):
result = ""
for i in range (0,len(data)-1,2):
high = map.find(data[i])
low = map.find(data[i+1])
if (high == -1) or (low == -1) :
break
value = (((high * len(map)) ^ 0x80) & 0xFF) + low
result += pack("B",value)
return result
# Parse the Kindle.info file and return the records as a list of key-values
def parseKindleInfo(kInfoFile):
DB = {}
infoReader = openKindleInfo(kInfoFile)
infoReader.read(1)
data = infoReader.read()
if sys.platform.startswith('win'):
items = data.split('{')
else :
items = data.split('[')
for item in items:
splito = item.split(':')
DB[splito[0]] =splito[1]
return DB
# Get a record from the Kindle.info file for the key "hashedKey" (already hashed and encoded). Return the decoded and decrypted record
def getKindleInfoValueForHash(hashedKey):
global kindleDatabase
global charMap1
global charMap2
encryptedValue = decode(kindleDatabase[hashedKey],charMap2)
if sys.platform.startswith('win'):
return CryptUnprotectData(encryptedValue,"")
else:
cleartext = CryptUnprotectData(encryptedValue)
return decode(cleartext, charMap1)
# Get a record from the Kindle.info file for the string in "key" (plaintext). Return the decoded and decrypted record
def getKindleInfoValueForKey(key):
global charMap2
return getKindleInfoValueForHash(encodeHash(key,charMap2))
# Find if the original string for a hashed/encoded string is known. If so return the original string othwise return an empty string.
def findNameForHash(hash):
global charMap2
names = ["kindle.account.tokens","kindle.cookie.item","eulaVersionAccepted","login_date","kindle.token.item","login","kindle.key.item","kindle.name.info","kindle.device.info", "MazamaRandomNumber"]
result = ""
for name in names:
if hash == encodeHash(name, charMap2):
result = name
break
return result
# Print all the records from the kindle.info file (option -i)
def printKindleInfo():
for record in kindleDatabase:
name = findNameForHash(record)
if name != "" :
print (name)
print ("--------------------------")
else :
print ("Unknown Record")
print getKindleInfoValueForHash(record)
print "\n"
#
# PID generation routines
#
# Returns two bit at offset from a bit field
def getTwoBitsFromBitField(bitField,offset):
byteNumber = offset // 4
bitPosition = 6 - 2*(offset % 4)
return ord(bitField[byteNumber]) >> bitPosition & 3
# Returns the six bits at offset from a bit field
def getSixBitsFromBitField(bitField,offset):
offset *= 3
value = (getTwoBitsFromBitField(bitField,offset) <<4) + (getTwoBitsFromBitField(bitField,offset+1) << 2) +getTwoBitsFromBitField(bitField,offset+2)
return value
# 8 bits to six bits encoding from hash to generate PID string
def encodePID(hash):
global charMap3
PID = ""
for position in range (0,8):
PID += charMap3[getSixBitsFromBitField(hash,position)]
return PID
# Encryption table used to generate the device PID
def generatePidEncryptionTable() :
table = []
for counter1 in range (0,0x100):
value = counter1
for counter2 in range (0,8):
if (value & 1 == 0) :
value = value >> 1
else :
value = value >> 1
value = value ^ 0xEDB88320
table.append(value)
return table
# Seed value used to generate the device PID
def generatePidSeed(table,dsn) :
value = 0
for counter in range (0,4) :
index = (ord(dsn[counter]) ^ value) &0xFF
value = (value >> 8) ^ table[index]
return value
# Generate the device PID
def generateDevicePID(table,dsn,nbRoll):
global charMap4
seed = generatePidSeed(table,dsn)
pidAscii = ""
pid = [(seed >>24) &0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF,(seed>>24) & 0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF]
index = 0
for counter in range (0,nbRoll):
pid[index] = pid[index] ^ ord(dsn[counter])
index = (index+1) %8
for counter in range (0,8):
index = ((((pid[counter] >>5) & 3) ^ pid[counter]) & 0x1f) + (pid[counter] >> 7)
pidAscii += charMap4[index]
return pidAscii
def crc32(s):
return (~binascii.crc32(s,-1))&0xFFFFFFFF
# convert from 8 digit PID to 10 digit PID with checksum
def checksumPid(s):
global charMap4
crc = crc32(s)
crc = crc ^ (crc >> 16)
res = s
l = len(charMap4)
for i in (0,1):
b = crc & 0xff
pos = (b // l) ^ (b % l)
res += charMap4[pos%l]
crc >>= 8
return res
# old kindle serial number to fixed pid
def pidFromSerial(s, l):
global charMap4
crc = crc32(s)
arr1 = [0]*l
for i in xrange(len(s)):
arr1[i%l] ^= ord(s[i])
crc_bytes = [crc >> 24 & 0xff, crc >> 16 & 0xff, crc >> 8 & 0xff, crc & 0xff]
for i in xrange(l):
arr1[i] ^= crc_bytes[i&3]
pid = ""
for i in xrange(l):
b = arr1[i] & 0xff
pid+=charMap4[(b >> 7) + ((b >> 5 & 3) ^ (b & 0x1f))]
return pid
# Parse the EXTH header records and use the Kindle serial number to calculate the book pid.
def getKindlePid(pidlst, rec209, token, serialnum):
if rec209 != None:
# Compute book PID
pidHash = SHA1(serialnum+rec209+token)
bookPID = encodePID(pidHash)
bookPID = checksumPid(bookPID)
pidlst.append(bookPID)
# compute fixed pid for old pre 2.5 firmware update pid as well
bookPID = pidFromSerial(serialnum, 7) + "*"
bookPID = checksumPid(bookPID)
pidlst.append(bookPID)
return pidlst
# Parse the EXTH header records and parse the Kindleinfo
# file to calculate the book pid.
def getK4Pids(pidlst, rec209, token, kInfoFile=None):
global kindleDatabase
global charMap1
kindleDatabase = None
try:
kindleDatabase = parseKindleInfo(kInfoFile)
except Exception, message:
print(message)
pass
if kindleDatabase == None :
return pidlst
# Get the Mazama Random number
MazamaRandomNumber = getKindleInfoValueForKey("MazamaRandomNumber")
# Get the HDD serial
encodedSystemVolumeSerialNumber = encodeHash(GetVolumeSerialNumber(),charMap1)
# Get the current user name
encodedUsername = encodeHash(GetUserName(),charMap1)
# concat, hash and encode to calculate the DSN
DSN = encode(SHA1(MazamaRandomNumber+encodedSystemVolumeSerialNumber+encodedUsername),charMap1)
# Compute the device PID (for which I can tell, is used for nothing).
table = generatePidEncryptionTable()
devicePID = generateDevicePID(table,DSN,4)
devicePID = checksumPid(devicePID)
pidlst.append(devicePID)
# Compute book PID
if rec209 == None:
print "\nNo EXTH record type 209 - Perhaps not a K4 file?"
return pidlst
# Get the kindle account token
kindleAccountToken = getKindleInfoValueForKey("kindle.account.tokens")
# book pid
pidHash = SHA1(DSN+kindleAccountToken+rec209+token)
bookPID = encodePID(pidHash)
bookPID = checksumPid(bookPID)
pidlst.append(bookPID)
# variant 1
pidHash = SHA1(kindleAccountToken+rec209+token)
bookPID = encodePID(pidHash)
bookPID = checksumPid(bookPID)
pidlst.append(bookPID)
# variant 2
pidHash = SHA1(DSN+rec209+token)
bookPID = encodePID(pidHash)
bookPID = checksumPid(bookPID)
pidlst.append(bookPID)
return pidlst
def getPidList(md1, md2, k4, pids, serials, kInfoFiles):
pidlst = []
if k4:
pidlst = getK4Pids(pidlst, md1, md2)
for infoFile in kInfoFiles:
pidlst = getK4Pids(pidlst, md1, md2, infoFile)
for serialnum in serials:
pidlst = getKindlePid(pidlst, md1, md2, serialnum)
for pid in pids:
pidlst.append(pid)
return pidlst

View File

@@ -0,0 +1,361 @@
#!/usr/bin/python
#
# This is a python script. You need a Python interpreter to run it.
# For example, ActiveState Python, which exists for windows.
#
# Changelog
# 0.01 - Initial version
# 0.02 - Huffdic compressed books were not properly decrypted
# 0.03 - Wasn't checking MOBI header length
# 0.04 - Wasn't sanity checking size of data record
# 0.05 - It seems that the extra data flags take two bytes not four
# 0.06 - And that low bit does mean something after all :-)
# 0.07 - The extra data flags aren't present in MOBI header < 0xE8 in size
# 0.08 - ...and also not in Mobi header version < 6
# 0.09 - ...but they are there with Mobi header version 6, header size 0xE4!
# 0.10 - Outputs unencrypted files as-is, so that when run as a Calibre
# import filter it works when importing unencrypted files.
# Also now handles encrypted files that don't need a specific PID.
# 0.11 - use autoflushed stdout and proper return values
# 0.12 - Fix for problems with metadata import as Calibre plugin, report errors
# 0.13 - Formatting fixes: retabbed file, removed trailing whitespace
# and extra blank lines, converted CR/LF pairs at ends of each line,
# and other cosmetic fixes.
# 0.14 - Working out when the extra data flags are present has been problematic
# Versions 7 through 9 have tried to tweak the conditions, but have been
# only partially successful. Closer examination of lots of sample
# files reveals that a confusion has arisen because trailing data entries
# are not encrypted, but it turns out that the multibyte entries
# in utf8 file are encrypted. (Although neither kind gets compressed.)
# This knowledge leads to a simplification of the test for the
# trailing data byte flags - version 5 and higher AND header size >= 0xE4.
# 0.15 - Now outputs 'heartbeat', and is also quicker for long files.
# 0.16 - And reverts to 'done' not 'done.' at the end for unswindle compatibility.
# 0.17 - added modifications to support its use as an imported python module
# both inside calibre and also in other places (ie K4DeDRM tools)
# 0.17a- disabled the standalone plugin feature since a plugin can not import
# a plugin
# 0.18 - It seems that multibyte entries aren't encrypted in a v7 file...
# Removed the disabled Calibre plug-in code
# Permit use of 8-digit PIDs
# 0.19 - It seems that multibyte entries aren't encrypted in a v6 file either.
# 0.20 - Correction: It seems that multibyte entries are encrypted in a v6 file.
# 0.21 - Added support for multiple pids
# 0.22 - revised structure to hold MobiBook as a class to allow an extended interface
__version__ = '0.22'
import sys
class Unbuffered:
def __init__(self, stream):
self.stream = stream
def write(self, data):
self.stream.write(data)
self.stream.flush()
def __getattr__(self, attr):
return getattr(self.stream, attr)
sys.stdout=Unbuffered(sys.stdout)
import struct
import binascii
class DrmException(Exception):
pass
#
# MobiBook Utility Routines
#
# Implementation of Pukall Cipher 1
def PC1(key, src, decryption=True):
sum1 = 0;
sum2 = 0;
keyXorVal = 0;
if len(key)!=16:
print "Bad key length!"
return None
wkey = []
for i in xrange(8):
wkey.append(ord(key[i*2])<<8 | ord(key[i*2+1]))
dst = ""
for i in xrange(len(src)):
temp1 = 0;
byteXorVal = 0;
for j in xrange(8):
temp1 ^= wkey[j]
sum2 = (sum2+j)*20021 + sum1
sum1 = (temp1*346)&0xFFFF
sum2 = (sum2+sum1)&0xFFFF
temp1 = (temp1*20021+1)&0xFFFF
byteXorVal ^= temp1 ^ sum2
curByte = ord(src[i])
if not decryption:
keyXorVal = curByte * 257;
curByte = ((curByte ^ (byteXorVal >> 8)) ^ byteXorVal) & 0xFF
if decryption:
keyXorVal = curByte * 257;
for j in xrange(8):
wkey[j] ^= keyXorVal;
dst+=chr(curByte)
return dst
def checksumPid(s):
letters = "ABCDEFGHIJKLMNPQRSTUVWXYZ123456789"
crc = (~binascii.crc32(s,-1))&0xFFFFFFFF
crc = crc ^ (crc >> 16)
res = s
l = len(letters)
for i in (0,1):
b = crc & 0xff
pos = (b // l) ^ (b % l)
res += letters[pos%l]
crc >>= 8
return res
def getSizeOfTrailingDataEntries(ptr, size, flags):
def getSizeOfTrailingDataEntry(ptr, size):
bitpos, result = 0, 0
if size <= 0:
return result
while True:
v = ord(ptr[size-1])
result |= (v & 0x7F) << bitpos
bitpos += 7
size -= 1
if (v & 0x80) != 0 or (bitpos >= 28) or (size == 0):
return result
num = 0
testflags = flags >> 1
while testflags:
if testflags & 1:
num += getSizeOfTrailingDataEntry(ptr, size - num)
testflags >>= 1
# Check the low bit to see if there's multibyte data present.
# if multibyte data is included in the encryped data, we'll
# have already cleared this flag.
if flags & 1:
num += (ord(ptr[size - num - 1]) & 0x3) + 1
return num
class MobiBook:
def loadSection(self, section):
if (section + 1 == self.num_sections):
endoff = len(self.data_file)
else:
endoff = self.sections[section + 1][0]
off = self.sections[section][0]
return self.data_file[off:endoff]
def __init__(self, infile):
# initial sanity check on file
self.data_file = file(infile, 'rb').read()
self.header = self.data_file[0:78]
if self.header[0x3C:0x3C+8] != 'BOOKMOBI':
raise DrmException("invalid file format")
# build up section offset and flag info
self.num_sections, = struct.unpack('>H', self.header[76:78])
self.sections = []
for i in xrange(self.num_sections):
offset, a1,a2,a3,a4 = struct.unpack('>LBBBB', self.data_file[78+i*8:78+i*8+8])
flags, val = a1, a2<<16|a3<<8|a4
self.sections.append( (offset, flags, val) )
# parse information from section 0
self.sect = self.loadSection(0)
self.records, = struct.unpack('>H', self.sect[0x8:0x8+2])
self.mobi_length, = struct.unpack('>L',self.sect[0x14:0x18])
self.mobi_version, = struct.unpack('>L',self.sect[0x68:0x6C])
print "MOBI header version = %d, length = %d" %(self.mobi_version, self.mobi_length)
self.extra_data_flags = 0
if (self.mobi_length >= 0xE4) and (self.mobi_version >= 5):
self.extra_data_flags, = struct.unpack('>H', self.sect[0xF2:0xF4])
print "Extra Data Flags = %d" % self.extra_data_flags
if self.mobi_version < 7:
# multibyte utf8 data is included in the encryption for mobi_version 6 and below
# so clear that byte so that we leave it to be decrypted.
self.extra_data_flags &= 0xFFFE
# if exth region exists parse it for metadata array
self.meta_array = {}
exth_flag, = struct.unpack('>L', self.sect[0x80:0x84])
exth = ''
if exth_flag & 0x40:
exth = self.sect[16 + self.mobi_length:]
nitems, = struct.unpack('>I', exth[8:12])
pos = 12
for i in xrange(nitems):
type, size = struct.unpack('>II', exth[pos: pos + 8])
content = exth[pos + 8: pos + size]
self.meta_array[type] = content
pos += size
def getBookTitle(self):
title = ''
if 503 in self.meta_array:
title = self.meta_array[503]
else :
toff, tlen = struct.unpack('>II', self.sect[0x54:0x5c])
tend = toff + tlen
title = self.sect[toff:tend]
if title == '':
title = self.header[:32]
title = title.split("\0")[0]
return title
def getPIDMetaInfo(self):
rec209 = None
token = None
if 209 in self.meta_array:
rec209 = self.meta_array[209]
data = rec209
# Parse the 209 data to find the the exth record with the token data.
# The last character of the 209 data points to the record with the token.
# Always 208 from my experience, but I'll leave the logic in case that changes.
for i in xrange(len(data)):
if ord(data[i]) != 0:
if self.meta_array[ord(data[i])] != None:
token = self.meta_array[ord(data[i])]
return rec209, token
def patch(self, off, new):
self.data_file = self.data_file[:off] + new + self.data_file[off+len(new):]
def patchSection(self, section, new, in_off = 0):
if (section + 1 == self.num_sections):
endoff = len(self.data_file)
else:
endoff = self.sections[section + 1][0]
off = self.sections[section][0]
assert off + in_off + len(new) <= endoff
self.patch(off + in_off, new)
def parseDRM(self, data, count, pidlist):
found_key = None
keyvec1 = "\x72\x38\x33\xB0\xB4\xF2\xE3\xCA\xDF\x09\x01\xD6\xE2\xE0\x3F\x96"
for pid in pidlist:
bigpid = pid.ljust(16,'\0')
temp_key = PC1(keyvec1, bigpid, False)
temp_key_sum = sum(map(ord,temp_key)) & 0xff
found_key = None
for i in xrange(count):
verification, size, type, cksum, cookie = struct.unpack('>LLLBxxx32s', data[i*0x30:i*0x30+0x30])
if cksum == temp_key_sum:
cookie = PC1(temp_key, cookie)
ver,flags,finalkey,expiry,expiry2 = struct.unpack('>LL16sLL', cookie)
if verification == ver and (flags & 0x1F) == 1:
found_key = finalkey
break
if found_key != None:
break
if not found_key:
# Then try the default encoding that doesn't require a PID
pid = "00000000"
temp_key = keyvec1
temp_key_sum = sum(map(ord,temp_key)) & 0xff
for i in xrange(count):
verification, size, type, cksum, cookie = struct.unpack('>LLLBxxx32s', data[i*0x30:i*0x30+0x30])
if cksum == temp_key_sum:
cookie = PC1(temp_key, cookie)
ver,flags,finalkey,expiry,expiry2 = struct.unpack('>LL16sLL', cookie)
if verification == ver:
found_key = finalkey
break
return [found_key,pid]
def processBook(self, pidlist):
crypto_type, = struct.unpack('>H', self.sect[0xC:0xC+2])
if crypto_type == 0:
print "This book is not encrypted."
return self.data_file
if crypto_type == 1:
raise DrmException("Cannot decode Mobipocket encryption type 1")
if crypto_type != 2:
raise DrmException("Cannot decode unknown Mobipocket encryption type %d" % crypto_type)
goodpids = []
for pid in pidlist:
if len(pid)==10:
if checksumPid(pid[0:-2]) != pid:
print "Warning: PID " + pid + " has incorrect checksum, should have been "+checksumPid(pid[0:-2])
goodpids.append(pid[0:-2])
elif len(pid)==8:
goodpids.append(pid)
# calculate the keys
drm_ptr, drm_count, drm_size, drm_flags = struct.unpack('>LLLL', self.sect[0xA8:0xA8+16])
if drm_count == 0:
raise DrmException("Not yet initialised with PID. Must be opened with Mobipocket Reader first.")
found_key, pid = self.parseDRM(self.sect[drm_ptr:drm_ptr+drm_size], drm_count, goodpids)
if not found_key:
raise DrmException("No key found. Most likely the correct PID has not been given.")
if pid=="00000000":
print "File has default encryption, no specific PID."
else:
print "File is encoded with PID "+checksumPid(pid)+"."
# kill the drm keys
self.patchSection(0, "\0" * drm_size, drm_ptr)
# kill the drm pointers
self.patchSection(0, "\xff" * 4 + "\0" * 12, 0xA8)
# clear the crypto type
self.patchSection(0, "\0" * 2, 0xC)
# decrypt sections
print "Decrypting. Please wait . . .",
new_data = self.data_file[:self.sections[1][0]]
for i in xrange(1, self.records+1):
data = self.loadSection(i)
extra_size = getSizeOfTrailingDataEntries(data, len(data), self.extra_data_flags)
if i%100 == 0:
print ".",
# print "record %d, extra_size %d" %(i,extra_size)
new_data += PC1(found_key, data[0:len(data) - extra_size])
if extra_size > 0:
new_data += data[-extra_size:]
if self.num_sections > self.records+1:
new_data += self.data_file[self.sections[self.records+1][0]:]
self.data_file = new_data
print "done"
return self.data_file
def getUnencryptedBook(infile,pid):
if not os.path.isfile(infile):
raise DrmException('Input File Not Found')
book = MobiBook(infile)
return book.processBook([pid])
def getUnencryptedBookWithList(infile,pidlist):
if not os.path.isfile(infile):
raise DrmException('Input File Not Found')
book = MobiBook(infile)
return book.processBook(pidlist)
def main(argv=sys.argv):
print ('MobiDeDrm v%(__version__)s. '
'Copyright 2008-2010 The Dark Reverser.' % globals())
if len(argv)<4:
print "Removes protection from Mobipocket books"
print "Usage:"
print " %s <infile> <outfile> <Comma separated list of PIDs to try>" % sys.argv[0]
return 1
else:
infile = argv[1]
outfile = argv[2]
pidlist = argv[3].split(',')
try:
stripped_file = getUnencryptedBookWithList(infile, pidlist)
file(outfile, 'wb').write(stripped_file)
except DrmException, e:
print "Error: %s" % e
return 1
return 0
if __name__ == "__main__":
sys.exit(main())

View File

@@ -0,0 +1,27 @@
#!/usr/bin/env python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
import Tkinter
import Tkconstants
# basic scrolled text widget
class ScrolledText(Tkinter.Text):
def __init__(self, master=None, **kw):
self.frame = Tkinter.Frame(master)
self.vbar = Tkinter.Scrollbar(self.frame)
self.vbar.pack(side=Tkconstants.RIGHT, fill=Tkconstants.Y)
kw.update({'yscrollcommand': self.vbar.set})
Tkinter.Text.__init__(self, self.frame, **kw)
self.pack(side=Tkconstants.LEFT, fill=Tkconstants.BOTH, expand=True)
self.vbar['command'] = self.yview
# Copy geometry methods of self.frame without overriding Text
# methods = hack!
text_meths = vars(Tkinter.Text).keys()
methods = vars(Tkinter.Pack).keys() + vars(Tkinter.Grid).keys() + vars(Tkinter.Place).keys()
methods = set(methods).difference(text_meths)
for m in methods:
if m[0] != '_' and m != 'config' and m != 'configure':
setattr(self, m, getattr(self.frame, m))
def __str__(self):
return str(self.frame)

View File

@@ -0,0 +1,243 @@
#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# For use with Topaz Scripts Version 2.6
import csv
import sys
import os
import getopt
from struct import pack
from struct import unpack
class DocParser(object):
def __init__(self, flatxml, fontsize, ph, pw):
self.flatdoc = flatxml.split('\n')
self.fontsize = int(fontsize)
self.ph = int(ph) * 1.0
self.pw = int(pw) * 1.0
stags = {
'paragraph' : 'p',
'graphic' : '.graphic'
}
attr_val_map = {
'hang' : 'text-indent: ',
'indent' : 'text-indent: ',
'line-space' : 'line-height: ',
'margin-bottom' : 'margin-bottom: ',
'margin-left' : 'margin-left: ',
'margin-right' : 'margin-right: ',
'margin-top' : 'margin-top: ',
'space-after' : 'padding-bottom: ',
}
attr_str_map = {
'align-center' : 'text-align: center; margin-left: auto; margin-right: auto;',
'align-left' : 'text-align: left;',
'align-right' : 'text-align: right;',
'align-justify' : 'text-align: justify;',
'display-inline' : 'display: inline;',
'pos-left' : 'text-align: left;',
'pos-right' : 'text-align: right;',
'pos-center' : 'text-align: center; margin-left: auto; margin-right: auto;',
}
# find tag if within pos to end inclusive
def findinDoc(self, tagpath, pos, end) :
result = None
docList = self.flatdoc
cnt = len(docList)
if end == -1 :
end = cnt
else:
end = min(cnt,end)
foundat = -1
for j in xrange(pos, end):
item = docList[j]
if item.find('=') >= 0:
(name, argres) = item.split('=',1)
else :
name = item
argres = ''
if name.endswith(tagpath) :
result = argres
foundat = j
break
return foundat, result
# return list of start positions for the tagpath
def posinDoc(self, tagpath):
startpos = []
pos = 0
res = ""
while res != None :
(foundpos, res) = self.findinDoc(tagpath, pos, -1)
if res != None :
startpos.append(foundpos)
pos = foundpos + 1
return startpos
def process(self):
classlst = ''
csspage = '.cl-center { text-align: center; margin-left: auto; margin-right: auto; }\n'
csspage += '.cl-right { text-align: right; }\n'
csspage += '.cl-left { text-align: left; }\n'
csspage += '.cl-justify { text-align: justify; }\n'
# generate a list of each <style> starting point in the stylesheet
styleList= self.posinDoc('book.stylesheet.style')
stylecnt = len(styleList)
styleList.append(-1)
# process each style converting what you can
for j in xrange(stylecnt):
start = styleList[j]
end = styleList[j+1]
(pos, tag) = self.findinDoc('style._tag',start,end)
if tag == None :
(pos, tag) = self.findinDoc('style.type',start,end)
# Is this something we know how to convert to css
if tag in self.stags :
# get the style class
(pos, sclass) = self.findinDoc('style.class',start,end)
if sclass != None:
sclass = sclass.replace(' ','-')
sclass = '.cl-' + sclass.lower()
else :
sclass = ''
# check for any "after class" specifiers
(pos, aftclass) = self.findinDoc('style._after_class',start,end)
if aftclass != None:
aftclass = aftclass.replace(' ','-')
aftclass = '.cl-' + aftclass.lower()
else :
aftclass = ''
cssargs = {}
while True :
(pos1, attr) = self.findinDoc('style.rule.attr', start, end)
(pos2, val) = self.findinDoc('style.rule.value', start, end)
if attr == None : break
if (attr == 'display') or (attr == 'pos') or (attr == 'align'):
# handle text based attributess
attr = attr + '-' + val
if attr in self.attr_str_map :
cssargs[attr] = (self.attr_str_map[attr], '')
else :
# handle value based attributes
if attr in self.attr_val_map :
name = self.attr_val_map[attr]
if attr in ('margin-bottom', 'margin-top', 'space-after') :
scale = self.ph
elif attr in ('margin-right', 'indent', 'margin-left', 'hang') :
scale = self.pw
elif attr == 'line-space':
scale = self.fontsize * 2.0
if not ((attr == 'hang') and (int(val) == 0)) :
pv = float(val)/scale
cssargs[attr] = (self.attr_val_map[attr], pv)
keep = True
start = max(pos1, pos2) + 1
# disable all of the after class tags until I figure out how to handle them
if aftclass != "" : keep = False
if keep :
# make sure line-space does not go below 100% or above 300% since
# it can be wacky in some styles
if 'line-space' in cssargs:
seg = cssargs['line-space'][0]
val = cssargs['line-space'][1]
if val < 1.0: val = 1.0
if val > 3.0: val = 3.0
del cssargs['line-space']
cssargs['line-space'] = (self.attr_val_map['line-space'], val)
# handle modifications for css style hanging indents
if 'hang' in cssargs:
hseg = cssargs['hang'][0]
hval = cssargs['hang'][1]
del cssargs['hang']
cssargs['hang'] = (self.attr_val_map['hang'], -hval)
mval = 0
mseg = 'margin-left: '
mval = hval
if 'margin-left' in cssargs:
mseg = cssargs['margin-left'][0]
mval = cssargs['margin-left'][1]
if mval < 0: mval = 0
mval = hval + mval
cssargs['margin-left'] = (mseg, mval)
if 'indent' in cssargs:
del cssargs['indent']
cssline = sclass + ' { '
for key in iter(cssargs):
mseg = cssargs[key][0]
mval = cssargs[key][1]
if mval == '':
cssline += mseg + ' '
else :
aseg = mseg + '%.1f%%;' % (mval * 100.0)
cssline += aseg + ' '
cssline += '}'
if sclass != '' :
classlst += sclass + '\n'
# handle special case of paragraph class used inside chapter heading
# and non-chapter headings
if sclass != '' :
ctype = sclass[4:7]
if ctype == 'ch1' :
csspage += 'h1' + cssline + '\n'
if ctype == 'ch2' :
csspage += 'h2' + cssline + '\n'
if ctype == 'ch3' :
csspage += 'h3' + cssline + '\n'
if ctype == 'h1-' :
csspage += 'h4' + cssline + '\n'
if ctype == 'h2-' :
csspage += 'h5' + cssline + '\n'
if ctype == 'h3_' :
csspage += 'h6' + cssline + '\n'
if cssline != ' { }':
csspage += self.stags[tag] + cssline + '\n'
return csspage, classlst
def convert2CSS(flatxml, fontsize, ph, pw):
print ' ', 'Using font size:',fontsize
print ' ', 'Using page height:', ph
print ' ', 'Using page width:', pw
# create a document parser
dp = DocParser(flatxml, fontsize, ph, pw)
csspage = dp.process()
return csspage

View File

@@ -0,0 +1,149 @@
#!/usr/bin/env python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
import os, sys
import signal
import threading
import subprocess
from subprocess import Popen, PIPE, STDOUT
# **heavily** chopped up and modfied version of asyncproc.py
# to make it actually work on Windows as well as Mac/Linux
# For the original see:
# "http://www.lysator.liu.se/~bellman/download/"
# author is "Thomas Bellman <bellman@lysator.liu.se>"
# available under GPL version 3 or Later
# create an asynchronous subprocess whose output can be collected in
# a non-blocking manner
# What a mess! Have to use threads just to get non-blocking io
# in a cross-platform manner
# luckily all thread use is hidden within this class
class Process(object):
def __init__(self, *params, **kwparams):
if len(params) <= 3:
kwparams.setdefault('stdin', subprocess.PIPE)
if len(params) <= 4:
kwparams.setdefault('stdout', subprocess.PIPE)
if len(params) <= 5:
kwparams.setdefault('stderr', subprocess.PIPE)
self.__pending_input = []
self.__collected_outdata = []
self.__collected_errdata = []
self.__exitstatus = None
self.__lock = threading.Lock()
self.__inputsem = threading.Semaphore(0)
self.__quit = False
self.__process = subprocess.Popen(*params, **kwparams)
if self.__process.stdin:
self.__stdin_thread = threading.Thread(
name="stdin-thread",
target=self.__feeder, args=(self.__pending_input,
self.__process.stdin))
self.__stdin_thread.setDaemon(True)
self.__stdin_thread.start()
if self.__process.stdout:
self.__stdout_thread = threading.Thread(
name="stdout-thread",
target=self.__reader, args=(self.__collected_outdata,
self.__process.stdout))
self.__stdout_thread.setDaemon(True)
self.__stdout_thread.start()
if self.__process.stderr:
self.__stderr_thread = threading.Thread(
name="stderr-thread",
target=self.__reader, args=(self.__collected_errdata,
self.__process.stderr))
self.__stderr_thread.setDaemon(True)
self.__stderr_thread.start()
def pid(self):
return self.__process.pid
def kill(self, signal):
self.__process.send_signal(signal)
# check on subprocess (pass in 'nowait') to act like poll
def wait(self, flag):
if flag.lower() == 'nowait':
rc = self.__process.poll()
else:
rc = self.__process.wait()
if rc != None:
if self.__process.stdin:
self.closeinput()
if self.__process.stdout:
self.__stdout_thread.join()
if self.__process.stderr:
self.__stderr_thread.join()
return self.__process.returncode
def terminate(self):
if self.__process.stdin:
self.closeinput()
self.__process.terminate()
# thread gets data from subprocess stdout
def __reader(self, collector, source):
while True:
data = os.read(source.fileno(), 65536)
self.__lock.acquire()
collector.append(data)
self.__lock.release()
if data == "":
source.close()
break
return
# thread feeds data to subprocess stdin
def __feeder(self, pending, drain):
while True:
self.__inputsem.acquire()
self.__lock.acquire()
if not pending and self.__quit:
drain.close()
self.__lock.release()
break
data = pending.pop(0)
self.__lock.release()
drain.write(data)
# non-blocking read of data from subprocess stdout
def read(self):
self.__lock.acquire()
outdata = "".join(self.__collected_outdata)
del self.__collected_outdata[:]
self.__lock.release()
return outdata
# non-blocking read of data from subprocess stderr
def readerr(self):
self.__lock.acquire()
errdata = "".join(self.__collected_errdata)
del self.__collected_errdata[:]
self.__lock.release()
return errdata
# non-blocking write to stdin of subprocess
def write(self, data):
if self.__process.stdin is None:
raise ValueError("Writing to process with stdin not a pipe")
self.__lock.acquire()
self.__pending_input.append(data)
self.__inputsem.release()
self.__lock.release()
# close stdinput of subprocess
def closeinput(self):
self.__lock.acquire()
self.__quit = True
self.__inputsem.release()
self.__lock.release()

View File

@@ -0,0 +1,434 @@
#!/usr/bin/env python
class Unbuffered:
def __init__(self, stream):
self.stream = stream
def write(self, data):
self.stream.write(data)
self.stream.flush()
def __getattr__(self, attr):
return getattr(self.stream, attr)
import sys
sys.stdout=Unbuffered(sys.stdout)
import os, csv, getopt
import zlib, zipfile, tempfile, shutil
from struct import pack
from struct import unpack
class TpzDRMError(Exception):
pass
# local support routines
import kgenpids
import genbook
#
# Utility routines
#
# Get a 7 bit encoded number from file
def bookReadEncodedNumber(fo):
flag = False
data = ord(fo.read(1))
if data == 0xFF:
flag = True
data = ord(fo.read(1))
if data >= 0x80:
datax = (data & 0x7F)
while data >= 0x80 :
data = ord(fo.read(1))
datax = (datax <<7) + (data & 0x7F)
data = datax
if flag:
data = -data
return data
# Get a length prefixed string from file
def bookReadString(fo):
stringLength = bookReadEncodedNumber(fo)
return unpack(str(stringLength)+"s",fo.read(stringLength))[0]
#
# crypto routines
#
# Context initialisation for the Topaz Crypto
def topazCryptoInit(key):
ctx1 = 0x0CAFFE19E
for keyChar in key:
keyByte = ord(keyChar)
ctx2 = ctx1
ctx1 = ((((ctx1 >>2) * (ctx1 >>7))&0xFFFFFFFF) ^ (keyByte * keyByte * 0x0F902007)& 0xFFFFFFFF )
return [ctx1,ctx2]
# decrypt data with the context prepared by topazCryptoInit()
def topazCryptoDecrypt(data, ctx):
ctx1 = ctx[0]
ctx2 = ctx[1]
plainText = ""
for dataChar in data:
dataByte = ord(dataChar)
m = (dataByte ^ ((ctx1 >> 3) &0xFF) ^ ((ctx2<<3) & 0xFF)) &0xFF
ctx2 = ctx1
ctx1 = (((ctx1 >> 2) * (ctx1 >> 7)) &0xFFFFFFFF) ^((m * m * 0x0F902007) &0xFFFFFFFF)
plainText += chr(m)
return plainText
# Decrypt data with the PID
def decryptRecord(data,PID):
ctx = topazCryptoInit(PID)
return topazCryptoDecrypt(data, ctx)
# Try to decrypt a dkey record (contains the bookPID)
def decryptDkeyRecord(data,PID):
record = decryptRecord(data,PID)
fields = unpack("3sB8sB8s3s",record)
if fields[0] != "PID" or fields[5] != "pid" :
raise TpzDRMError("Didn't find PID magic numbers in record")
elif fields[1] != 8 or fields[3] != 8 :
raise TpzDRMError("Record didn't contain correct length fields")
elif fields[2] != PID :
raise TpzDRMError("Record didn't contain PID")
return fields[4]
# Decrypt all dkey records (contain the book PID)
def decryptDkeyRecords(data,PID):
nbKeyRecords = ord(data[0])
records = []
data = data[1:]
for i in range (0,nbKeyRecords):
length = ord(data[0])
try:
key = decryptDkeyRecord(data[1:length+1],PID)
records.append(key)
except TpzDRMError:
pass
data = data[1+length:]
if len(records) == 0:
raise TpzDRMError("BookKey Not Found")
return records
class TopazBook:
def __init__(self, filename, outdir):
self.fo = file(filename, 'rb')
self.outdir = outdir
self.bookPayloadOffset = 0
self.bookHeaderRecords = {}
self.bookMetadata = {}
self.bookKey = None
magic = unpack("4s",self.fo.read(4))[0]
if magic != 'TPZ0':
raise TpzDRMError("Parse Error : Invalid Header, not a Topaz file")
self.parseTopazHeaders()
self.parseMetadata()
def parseTopazHeaders(self):
def bookReadHeaderRecordData():
# Read and return the data of one header record at the current book file position
# [[offset,decompressedLength,compressedLength],...]
nbValues = bookReadEncodedNumber(self.fo)
values = []
for i in range (0,nbValues):
values.append([bookReadEncodedNumber(self.fo),bookReadEncodedNumber(self.fo),bookReadEncodedNumber(self.fo)])
return values
def parseTopazHeaderRecord():
# Read and parse one header record at the current book file position and return the associated data
# [[offset,decompressedLength,compressedLength],...]
if ord(self.fo.read(1)) != 0x63:
raise TpzDRMError("Parse Error : Invalid Header")
tag = bookReadString(self.fo)
record = bookReadHeaderRecordData()
return [tag,record]
nbRecords = bookReadEncodedNumber(self.fo)
for i in range (0,nbRecords):
result = parseTopazHeaderRecord()
# print result[0], result[1]
self.bookHeaderRecords[result[0]] = result[1]
if ord(self.fo.read(1)) != 0x64 :
raise TpzDRMError("Parse Error : Invalid Header")
self.bookPayloadOffset = self.fo.tell()
def parseMetadata(self):
# Parse the metadata record from the book payload and return a list of [key,values]
self.fo.seek(self.bookPayloadOffset + self.bookHeaderRecords["metadata"][0][0])
tag = bookReadString(self.fo)
if tag != "metadata" :
raise TpzDRMError("Parse Error : Record Names Don't Match")
flags = ord(self.fo.read(1))
nbRecords = ord(self.fo.read(1))
for i in range (0,nbRecords) :
record = [bookReadString(self.fo), bookReadString(self.fo)]
self.bookMetadata[record[0]] = record[1]
return self.bookMetadata
def getPIDMetaInfo(self):
keysRecord = None
KeysRecordRecord = None
if 'keys' in self.bookMetadata:
keysRecord = self.bookMetadata['keys']
keysRecordRecord = self.bookMetadata[keysRecord]
return keysRecord, keysRecordRecord
def getBookTitle(self):
title = ''
if 'Title' in self.bookMetadata:
title = self.bookMetadata['Title']
return title
def setBookKey(self, key):
self.bookKey = key
def getBookPayloadRecord(self, name, index):
# Get a record in the book payload, given its name and index.
# decrypted and decompressed if necessary
encrypted = False
compressed = False
try:
recordOffset = self.bookHeaderRecords[name][index][0]
except:
raise TpzDRMError("Parse Error : Invalid Record, record not found")
self.fo.seek(self.bookPayloadOffset + recordOffset)
tag = bookReadString(self.fo)
if tag != name :
raise TpzDRMError("Parse Error : Invalid Record, record name doesn't match")
recordIndex = bookReadEncodedNumber(self.fo)
if recordIndex < 0 :
encrypted = True
recordIndex = -recordIndex -1
if recordIndex != index :
raise TpzDRMError("Parse Error : Invalid Record, index doesn't match")
if (self.bookHeaderRecords[name][index][2] > 0):
compressed = True
record = self.fo.read(self.bookHeaderRecords[name][index][2])
else:
record = self.fo.read(self.bookHeaderRecords[name][index][1])
if encrypted:
if self.bookKey:
ctx = topazCryptoInit(self.bookKey)
record = topazCryptoDecrypt(record,ctx)
else :
raise TpzDRMError("Error: Attempt to decrypt without bookKey")
if compressed:
record = zlib.decompress(record)
return record
def processBook(self, pidlst):
raw = 0
fixedimage=True
try:
keydata = self.getBookPayloadRecord('dkey', 0)
except TpzDRMError, e:
print "no dkey record found, book may not be encrypted"
print "attempting to extrct files without a book key"
self.createBookDirectory()
self.extractFiles()
print "Successfully Extracted Topaz contents"
rv = genbook.generateBook(self.outdir, raw, fixedimage)
if rv == 0:
print "\nBook Successfully generated"
return rv
# try each pid to decode the file
bookKey = None
for pid in pidlst:
# use 8 digit pids here
pid = pid[0:8]
print "\nTrying: ", pid
bookKeys = []
data = keydata
try:
bookKeys+=decryptDkeyRecords(data,pid)
except TpzDRMError, e:
pass
else:
bookKey = bookKeys[0]
print "Book Key Found!"
break
if not bookKey:
raise TpzDRMError('Decryption Unsucessful; No valid pid found')
self.setBookKey(bookKey)
self.createBookDirectory()
self.extractFiles()
print "Successfully Extracted Topaz contents"
rv = genbook.generateBook(self.outdir, raw, fixedimage)
if rv == 0:
print "\nBook Successfully generated"
return rv
def createBookDirectory(self):
outdir = self.outdir
# create output directory structure
if not os.path.exists(outdir):
os.makedirs(outdir)
destdir = os.path.join(outdir,'img')
if not os.path.exists(destdir):
os.makedirs(destdir)
destdir = os.path.join(outdir,'color_img')
if not os.path.exists(destdir):
os.makedirs(destdir)
destdir = os.path.join(outdir,'page')
if not os.path.exists(destdir):
os.makedirs(destdir)
destdir = os.path.join(outdir,'glyphs')
if not os.path.exists(destdir):
os.makedirs(destdir)
def extractFiles(self):
outdir = self.outdir
for headerRecord in self.bookHeaderRecords:
name = headerRecord
if name != "dkey" :
ext = '.dat'
if name == 'img' : ext = '.jpg'
if name == 'color' : ext = '.jpg'
print "\nProcessing Section: %s " % name
for index in range (0,len(self.bookHeaderRecords[name])) :
fnum = "%04d" % index
fname = name + fnum + ext
destdir = outdir
if name == 'img':
destdir = os.path.join(outdir,'img')
if name == 'color':
destdir = os.path.join(outdir,'color_img')
if name == 'page':
destdir = os.path.join(outdir,'page')
if name == 'glyphs':
destdir = os.path.join(outdir,'glyphs')
outputFile = os.path.join(destdir,fname)
print ".",
record = self.getBookPayloadRecord(name,index)
if record != '':
file(outputFile, 'wb').write(record)
print " "
def zipUpDir(myzip, tempdir,localname):
currentdir = tempdir
if localname != "":
currentdir = os.path.join(currentdir,localname)
list = os.listdir(currentdir)
for file in list:
afilename = file
localfilePath = os.path.join(localname, afilename)
realfilePath = os.path.join(currentdir,file)
if os.path.isfile(realfilePath):
myzip.write(realfilePath, localfilePath)
elif os.path.isdir(realfilePath):
zipUpDir(myzip, tempdir, localfilePath)
def usage(progname):
print "Removes DRM protection from Topaz ebooks and extract the contents"
print "Usage:"
print " %s [-k <kindle.info>] [-p <pidnums>] [-s <kindleSerialNumbers>] <infile> <outdir> " % progname
# Main
def main(argv=sys.argv):
progname = os.path.basename(argv[0])
k4 = False
pids = []
serials = []
kInfoFiles = []
try:
opts, args = getopt.getopt(sys.argv[1:], "k:p:s:")
except getopt.GetoptError, err:
print str(err)
usage(progname)
return 1
if len(args)<2:
usage(progname)
return 1
for o, a in opts:
if o == "-k":
if a == None :
print "Invalid parameter for -k"
return 1
kInfoFiles.append(a)
if o == "-p":
if a == None :
print "Invalid parameter for -p"
return 1
pids = a.split(',')
if o == "-s":
if a == None :
print "Invalid parameter for -s"
return 1
serials = a.split(',')
k4 = True
infile = args[0]
outdir = args[1]
if not os.path.isfile(infile):
print "Input File Does Not Exist"
return 1
bookname = os.path.splitext(os.path.basename(infile))[0]
tempdir = tempfile.mkdtemp()
tb = TopazBook(infile, tempdir)
title = tb.getBookTitle()
print "Processing Book: ", title
keysRecord, keysRecordRecord = tb.getPIDMetaInfo()
pidlst = kgenpids.getPidList(keysRecord, keysRecordRecord, k4, pids, serials, kInfoFiles)
try:
tb.processBook(pidlst)
except TpzDRMError, e:
print str(e)
print " Creating DeBug Full Zip Archive of Book"
zipname = os.path.join(outdir, bookname + '_debug' + '.zip')
myzip = zipfile.ZipFile(zipname,'w',zipfile.ZIP_DEFLATED, False)
zipUpDir(myzip, tempdir, '')
myzip.close()
return 1
print " Creating HTML ZIP Archive"
zipname = os.path.join(outdir, bookname + '_nodrm' + '.zip')
myzip1 = zipfile.ZipFile(zipname,'w',zipfile.ZIP_DEFLATED, False)
myzip1.write(os.path.join(tempdir,'book.html'),'book.html')
myzip1.write(os.path.join(tempdir,'book.opf'),'book.opf')
if os.path.isfile(os.path.join(tempdir,'cover.jpg')):
myzip1.write(os.path.join(tempdir,'cover.jpg'),'cover.jpg')
myzip1.write(os.path.join(tempdir,'style.css'),'style.css')
zipUpDir(myzip1, tempdir, 'img')
myzip1.close()
print " Creating SVG ZIP Archive"
zipname = os.path.join(outdir, bookname + '_SVG' + '.zip')
myzip2 = zipfile.ZipFile(zipname,'w',zipfile.ZIP_DEFLATED, False)
myzip2.write(os.path.join(tempdir,'index_svg.xhtml'),'index_svg.xhtml')
zipUpDir(myzip2, tempdir, 'svg')
zipUpDir(myzip2, tempdir, 'img')
myzip2.close()
print " Creating XML ZIP Archive"
zipname = os.path.join(outdir, bookname + '_XML' + '.zip')
myzip3 = zipfile.ZipFile(zipname,'w',zipfile.ZIP_DEFLATED, False)
targetdir = os.path.join(tempdir,'xml')
zipUpDir(myzip3, targetdir, '')
zipUpDir(myzip3, tempdir, 'img')
myzip3.close()
shutil.rmtree(tempdir)
return 0
if __name__ == '__main__':
sys.exit(main())