From 36b28765dd9af34f1be3c639c782d32a1ea4911e Mon Sep 17 00:00:00 2001 From: Florian Bach Date: Sun, 3 Oct 2021 10:30:45 +0200 Subject: [PATCH] v0.0.8: PDF fixes & ACSM file type config --- calibre-plugin/__init__.py | 78 +++++++---- calibre-plugin/fulfill.py | 52 ++++--- calibre-plugin/libadobe.py | 34 +++++ calibre-plugin/libpdf.py | 277 ++++++++++++++++++++++++++----------- 4 files changed, 309 insertions(+), 132 deletions(-) diff --git a/calibre-plugin/__init__.py b/calibre-plugin/__init__.py index c7fb96c..75e88cb 100644 --- a/calibre-plugin/__init__.py +++ b/calibre-plugin/__init__.py @@ -12,10 +12,11 @@ # v0.0.5: Bugfix: DeDRM plugin was also executed if it's installed but disabled. # v0.0.6: First PDF support, allow importing previously exported activation data. # v0.0.7: More PDF logging, PDF reading in latin-1, MacOS locale bugfix +# v0.0.8: More PDF bugfixes, support unlimited PDF file sizes, tell Calibre ACSMs are books. from calibre.customize import FileTypePlugin # type: ignore -__version__ = '0.0.7' +__version__ = '0.0.8' PLUGIN_NAME = "DeACSM" PLUGIN_VERSION_TUPLE = tuple([int(x) for x in __version__.split(".")]) @@ -24,7 +25,7 @@ PLUGIN_VERSION = ".".join([str(x)for x in PLUGIN_VERSION_TUPLE]) from calibre.utils.config import config_dir # type: ignore -import os, shutil, traceback, sys +import os, shutil, traceback, sys, time import zipfile from lxml import etree @@ -49,6 +50,18 @@ class DeACSM(FileTypePlugin): """ try: + + # Patch Calibre to consider "ACSM" a book. This makes ACSM files show up + # in the "Add Book" file selection, and it also makes the auto-add feature useable. + try: + from calibre.ebooks import BOOK_EXTENSIONS + if ("acsm" not in BOOK_EXTENSIONS): + BOOK_EXTENSIONS.append("acsm") + except: + print("{0} v{1}: Couldn't add ACSM to book extension list:".format(PLUGIN_NAME, PLUGIN_VERSION)) + traceback.print_exc() + + self.pluginsdir = os.path.join(config_dir,"plugins") if not os.path.exists(self.pluginsdir): os.mkdir(self.pluginsdir) @@ -167,21 +180,21 @@ class DeACSM(FileTypePlugin): try: - from calibre_plugins.deacsm.libadobe import sendHTTPRequest + from calibre_plugins.deacsm.libadobe import sendHTTPRequest_DL2FILE from calibre_plugins.deacsm.libadobeFulfill import buildRights, fulfill except: try: - from libadobe import sendHTTPRequest + from libadobe import sendHTTPRequest_DL2FILE from libadobeFulfill import buildRights, fulfill except: print("{0} v{1}: Error while importing Fulfillment stuff".format(PLUGIN_NAME, PLUGIN_VERSION)) traceback.print_exc() try: - from calibre_plugins.deacsm.libpdf import patch_drm_into_pdf, prepare_string_from_xml + from calibre_plugins.deacsm.libpdf import patch_drm_into_pdf except: try: - from libpdf import patch_drm_into_pdf, prepare_string_from_xml + from libpdf import patch_drm_into_pdf except: print("{0} v{1}: Error while importing PDF patch".format(PLUGIN_NAME, PLUGIN_VERSION)) traceback.print_exc() @@ -190,9 +203,7 @@ class DeACSM(FileTypePlugin): adobe_fulfill_response = etree.fromstring(replyData) NSMAP = { "adept" : "http://ns.adobe.com/adept" } adNS = lambda tag: '{%s}%s' % ('http://ns.adobe.com/adept', tag) - adDC = lambda tag: '{%s}%s' % ('http://purl.org/dc/elements/1.1/', tag) - metadata_node = adobe_fulfill_response.find("./%s/%s/%s" % (adNS("fulfillmentResult"), adNS("resourceItemInfo"), adNS("metadata"))) download_url = adobe_fulfill_response.find("./%s/%s/%s" % (adNS("fulfillmentResult"), adNS("resourceItemInfo"), adNS("src"))).text license_token_node = adobe_fulfill_response.find("./%s/%s/%s" % (adNS("fulfillmentResult"), adNS("resourceItemInfo"), adNS("licenseToken"))) @@ -205,9 +216,25 @@ class DeACSM(FileTypePlugin): # Download eBook: print("{0} v{1}: Loading book from {2}".format(PLUGIN_NAME, PLUGIN_VERSION, download_url)) - book_content = sendHTTPRequest(download_url) + filename_tmp = self.temporary_file(".blob").name + + dl_start_time = int(time.time() * 1000) + ret = sendHTTPRequest_DL2FILE(download_url, filename_tmp) + dl_end_time = int(time.time() * 1000) + + print("Download took %d ms (HTTP %d)" % (dl_end_time - dl_start_time, ret)) + + if (ret != 200): + print("{0} v{1}: Download failed with error {2}".format(PLUGIN_NAME, PLUGIN_VERSION, ret)) + return None + filetype = ".bin" - + + book_content = None + + with open(filename_tmp, "rb") as f: + book_content = f.read(10) + if (book_content.startswith(b"PK")): print("That's a ZIP file -> EPUB") filetype = ".epub" @@ -217,23 +244,9 @@ class DeACSM(FileTypePlugin): filename = self.temporary_file(filetype).name - author = "None" - title = "None" - - try: - title = metadata_node.find("./%s" % (adDC("title"))).text - author = metadata_node.find("./%s" % (adDC("creator"))).text + # Move to file name with matching extension + shutil.move(filename_tmp, filename) - title = title.replace("(", "").replace(")", "").replace("/", "") - author = author.replace("(", "").replace(")", "").replace("/", "") - - except: - pass - - # Store book: - f = open(filename, "wb") - f.write(book_content) - f.close() if filetype == ".epub": # Store EPUB rights / encryption stuff @@ -244,10 +257,19 @@ class DeACSM(FileTypePlugin): return filename elif filetype == ".pdf": + adobe_fulfill_response = etree.fromstring(rights_xml_str) + NSMAP = { "adept" : "http://ns.adobe.com/adept" } + adNS = lambda tag: '{%s}%s' % ('http://ns.adobe.com/adept', tag) + resource = adobe_fulfill_response.find("./%s/%s" % (adNS("licenseToken"), adNS("resource"))).text + print("{0} v{1}: Downloaded PDF, adding encryption config ...".format(PLUGIN_NAME, PLUGIN_VERSION)) pdf_tmp_file = self.temporary_file(filetype).name - patch_drm_into_pdf(filename, prepare_string_from_xml(rights_xml_str, title, author), pdf_tmp_file) - print("{0} v{1}: File successfully fulfilled ...".format(PLUGIN_NAME, PLUGIN_VERSION)) + ret = patch_drm_into_pdf(filename, rights_xml_str, pdf_tmp_file, resource) + if (ret): + print("{0} v{1}: File successfully fulfilled ...".format(PLUGIN_NAME, PLUGIN_VERSION)) + else: + print("{0} v{1}: There was an error patching the PDF file.".format(PLUGIN_NAME, PLUGIN_VERSION)) + return pdf_tmp_file else: print("{0} v{1}: Error: Unsupported file type ...".format(PLUGIN_NAME, PLUGIN_VERSION)) diff --git a/calibre-plugin/fulfill.py b/calibre-plugin/fulfill.py index 34fa82b..6b0e4ee 100644 --- a/calibre-plugin/fulfill.py +++ b/calibre-plugin/fulfill.py @@ -7,7 +7,7 @@ This is an experimental Python version of libgourou. # pyright: reportUndefinedVariable=false -import sys, os +import sys, os, time, shutil if sys.version_info[0] < 3: print("This script requires Python 3.") exit(1) @@ -15,9 +15,9 @@ if sys.version_info[0] < 3: import zipfile from lxml import etree -from libadobe import sendHTTPRequest +from libadobe import sendHTTPRequest_DL2FILE from libadobeFulfill import buildRights, fulfill -from libpdf import patch_drm_into_pdf, prepare_string_from_xml +from libpdf import patch_drm_into_pdf FILE_DEVICEKEY = "devicesalt" FILE_DEVICEXML = "device.xml" @@ -48,28 +48,31 @@ def download(replyData): exit(1) book_name = None - author = "None" - title = "None" + try: book_name = metadata_node.find("./%s" % (adDC("title"))).text except: book_name = "Book" - try: - title = metadata_node.find("./%s" % (adDC("title"))).text - author = metadata_node.find("./%s" % (adDC("creator"))).text - - title = title.replace("(", "").replace(")", "").replace("/", "") - author = author.replace("(", "").replace(")", "").replace("/", "") - - except: - pass # Download eBook: print(download_url) - book_content = sendHTTPRequest(download_url) + filename_tmp = book_name + ".tmp" + + dl_start_time = int(time.time() * 1000) + ret = sendHTTPRequest_DL2FILE(download_url, filename_tmp) + dl_end_time = int(time.time() * 1000) + print("Download took %d milliseconds" % (dl_end_time - dl_start_time)) + + if (ret != 200): + print("Download failed with error %d" % (ret)) + exit() + + with open(filename_tmp, "rb") as f: + book_content = f.read(10) + filetype = ".bin" if (book_content.startswith(b"PK")): @@ -80,11 +83,7 @@ def download(replyData): filetype = ".pdf" filename = book_name + filetype - - # Store book: - f = open(filename, "wb") - f.write(book_content) - f.close() + shutil.move(filename_tmp, filename) if filetype == ".epub": # Store EPUB rights / encryption stuff @@ -97,11 +96,20 @@ def download(replyData): elif filetype == ".pdf": print("Successfully downloaded PDF, patching encryption ...") + + adobe_fulfill_response = etree.fromstring(rights_xml_str) + NSMAP = { "adept" : "http://ns.adobe.com/adept" } + adNS = lambda tag: '{%s}%s' % ('http://ns.adobe.com/adept', tag) + resource = adobe_fulfill_response.find("./%s/%s" % (adNS("licenseToken"), adNS("resource"))).text os.rename(filename, "tmp_" + filename) - patch_drm_into_pdf("tmp_" + filename, prepare_string_from_xml(rights_xml_str, author, title), filename) + ret = patch_drm_into_pdf("tmp_" + filename, rights_xml_str, filename, resource) os.remove("tmp_" + filename) - print("File successfully fulfilled to " + filename) + if (ret): + print("File successfully fulfilled to " + filename) + else: + print("Errors occurred while patching " + filename) + exit(1) exit(0) else: print("Error: Weird filetype") diff --git a/calibre-plugin/libadobe.py b/calibre-plugin/libadobe.py index 4f75912..5736827 100644 --- a/calibre-plugin/libadobe.py +++ b/calibre-plugin/libadobe.py @@ -121,6 +121,40 @@ def makeFingerprint(serial: str): ############################################## HTTP stuff: +def sendHTTPRequest_DL2FILE(URL: str, outputfile: str): + headers = { + "Accept": "*/*", + "User-Agent": "book2png", + } + req = urllib.request.Request(url=URL, headers=headers) + handler = urllib.request.urlopen(req) + + chunksize = 16 * 1024 + + ret_code = handler.getcode() + + + loc = None + try: + loc = req.headers.get("Location") + except: + pass + + if loc is not None: + return sendHTTPRequest_DL2FILE(loc) + + if ret_code != 200: + return ret_code + + with open(outputfile, "wb") as f: + while True: + chunk = handler.read(chunksize) + if not chunk: + break + f.write(chunk) + + return 200 + def sendHTTPRequest_getSimple(URL: str): headers = { diff --git a/calibre-plugin/libpdf.py b/calibre-plugin/libpdf.py index 82721af..fbb8c49 100644 --- a/calibre-plugin/libpdf.py +++ b/calibre-plugin/libpdf.py @@ -1,112 +1,221 @@ -import os, zlib, base64 -from lxml import etree +import os, zlib, base64, time +class BackwardReader: -def read_reverse_order(file_name): - # Open file for reading in binary mode - with open(file_name, 'rb') as read_obj: - # Move the cursor to the end of the file - read_obj.seek(0, os.SEEK_END) - # Get the current position of pointer i.e eof - pointer_location = read_obj.tell() - # Create a buffer to keep the last read line + def __init__(self, file): + self.file = file + + def readlines(self): + BLKSIZE = 4096 + # Move reader to the end of file + self.file.seek(0, os.SEEK_END) buffer = bytearray() - # Loop till pointer reaches the top of the file - while pointer_location >= 0: - # Move the file pointer to the location pointed by pointer_location - read_obj.seek(pointer_location) - # Shift pointer location by -1 - pointer_location = pointer_location -1 - # read that byte / character - new_byte = read_obj.read(1) - # If the read byte is new line character then it means one line is read - if new_byte == b'\n': - # Fetch the line from buffer and yield it - yield buffer.decode("latin-1")[::-1] - # Reinitialize the byte array to save next line - buffer = bytearray() + + while True: + pos_newline = buffer.rfind(bytes([0x0a])) + # Get the current position of the reader + current_pos = self.file.tell() + if pos_newline != -1: + # Newline is found + line = buffer[pos_newline+1:] + buffer = buffer[:pos_newline] + yield line.decode("latin-1") + elif current_pos: + # Need to fill the buffer + to_read = min(BLKSIZE, current_pos) + self.file.seek(current_pos-to_read, 0) + buffer = self.file.read(to_read) + buffer + self.file.seek(current_pos-to_read, 0) + if current_pos is to_read: + buffer = bytes([0x0a]) + buffer else: - # If last read character is not eol then add it in buffer - buffer.extend(new_byte) - # As file is read completely, if there is still data in buffer, then its the first line. - if len(buffer) > 0: - # Yield the first line too - yield buffer.decode("latin-1")[::-1] + # Start of file + return + + + +def trim_encrypt_string(encrypt): + + string_list = list(encrypt) + strlen = len(encrypt) + + i = 0 + bracket_count = 0 + while (i < strlen): + if string_list[i] == "<" and string_list[i+1] == "<": + bracket_count += 1 + + if string_list[i] == ">" and string_list[i+1] == ">": + bracket_count -= 1 + + if bracket_count == 0: + break + + i = i + 1 + + len_to_use = i+2 + + return encrypt[0:len_to_use] + +def cleanup_encrypt_element(element): + + if element.startswith("ID[<"): + element = element.replace("><", "> <") + + element = ' '.join(element.split()) + element = element.replace("[ ", "[").replace("] ", "]") + + return element + + + def deflate_and_base64_encode( string_val ): zlibbed_str = zlib.compress( string_val ) compressed_string = zlibbed_str[2:-4] return base64.b64encode( compressed_string ) -def prepare_string_from_xml(xmlstring, title, author): - b64data = deflate_and_base64_encode(xmlstring.encode("utf-8")).decode("utf-8") +def update_ebx_with_keys(ebx_data, adept_license, ebx_bookid): - adobe_fulfill_response = etree.fromstring(xmlstring) - NSMAP = { "adept" : "http://ns.adobe.com/adept" } - adNS = lambda tag: '{%s}%s' % ('http://ns.adobe.com/adept', tag) - resource = adobe_fulfill_response.find("./%s/%s" % (adNS("licenseToken"), adNS("resource"))).text + b64data = deflate_and_base64_encode(adept_license.encode("utf-8")).decode("utf-8") - return "<>" % (title, author, resource, resource, b64data) + ebx_new = ebx_data[:-2] + ebx_new += "/EBX_BOOKID(%s)/ADEPT_LICENSE(%s)>>" % (ebx_bookid, b64data) -def patch_drm_into_pdf(filename_in, drm_string, filename_out): + return ebx_new - ORIG_FILE = filename_in + +def find_ebx(filename_in): + find_ebx_start = int(time.time() * 1000) + i = 0 + + fl = open(filename_in, "rb") + br = BackwardReader(fl) + + for line in br.readlines(): + i = i + 1 + if "/EBX_HANDLER/" in line: + find_ebx_end = int(time.time() * 1000) + print("Found EBX after %d attempts - took %d ms" % (i, find_ebx_end - find_ebx_start)) + return line + + find_ebx_end = int(time.time() * 1000) + print("Error: Did not find EBX_HANDLER - took %d ms" % (find_ebx_end - find_ebx_start)) + return None + +def find_enc(filename_in): + find_enc_start = int(time.time() * 1000) + i = 0 + + fl = open(filename_in, "rb") + br = BackwardReader(fl) + + for line in br.readlines(): + i = i + 1 + if "R/Encrypt" in line and "R/ID" in line: + find_enc_end = int(time.time() * 1000) + print("Found ENC after %d attempts - took %d ms" % (i, find_enc_end - find_enc_start)) + return line + + find_enc_end = int(time.time() * 1000) + print("Error: Did not find ENC - took %d ms" % (find_enc_end - find_enc_start)) + return None + + + +def patch_drm_into_pdf(filename_in, adept_license_string, filename_out, ebx_bookid): + + drm_start_time = int(time.time() * 1000) trailer = "" trailer_idx = 0 - print("DRM data is %s" % (drm_string)) + startxref_offset = 0 + prevline = "" - for line in read_reverse_order(ORIG_FILE): + + fl = open(filename_in, "rb") + br = BackwardReader(fl) + + print("Searching for startxref ...") + for line in br.readlines(): trailer_idx += 1 trailer = line + "\n" + trailer - print("DEBUG: pdfdata[%d] = %s" % (trailer_idx, line)) - if (trailer_idx == 20): - print("trailer_idx is very large (%d). Usually it's 10 or less. File might be corrupted." % trailer_idx) - if (line == "trailer"): - print("Found trailer at idx %d" % (trailer_idx)) + + print ("LINE: " + line) + + if (trailer_idx > 10): + print("Took more than 10 attempts to find startxref ...") + return False + + if (line == "startxref"): + startxref_offset = int(prevline) + print("Got startxref: %d" % (startxref_offset)) break + prevline = line + + r_encrypt_offs1 = 0 r_encrypt_offs2 = 0 - root_str = None - next_startxref = False - startxref = None - for line in trailer.split('\n'): - #print(line) - if ("R/Encrypt" in line): - root_str = line - line_split = line.split(' ') + encrypt = None + + + encrypt = find_enc(filename_in) + if encrypt is None: + print("Error, enc not found") + return False + + line_split = encrypt.split(' ') + next = 0 + for element in line_split: + if element == "R/Encrypt": + next = 2 + continue + if next == 2: + r_encrypt_offs1 = element + next = 1 + continue + if next == 1: + r_encrypt_offs2 = element next = 0 - for element in line_split: - if element == "R/Encrypt": - next = 2 - continue - if next == 2: - r_encrypt_offs1 = element - next = 1 - continue - if next == 1: - r_encrypt_offs2 = element - next = 0 - continue - if "startxref" in line: - next_startxref = True - continue - if next_startxref: - startxref = line - next_startxref = False continue + + + # read EBX element: + ebx_elem = find_ebx(filename_in) + + if (ebx_elem is None): + print("Err: EBX is None") + return False + + + print("") + print("") + print("Encryption handler:") + print(encrypt) + print("EBX handler:") + print(ebx_elem) + + encrypt = trim_encrypt_string(encrypt) + + print("Trimmed encryption handler:") + print(encrypt) + + ebx_elem = update_ebx_with_keys(ebx_elem, adept_license_string, ebx_bookid) + + print("Updated EBX handler not logged due to sensitive data") + #print(ebx_elem) - filesize_str = str(os.path.getsize(ORIG_FILE)) + filesize_str = str(os.path.getsize(filename_in)) filesize_pad = filesize_str.zfill(10) additional_data = "\r" additional_data += r_encrypt_offs1 + " " + r_encrypt_offs2 + " " + "obj" + "\r" - additional_data += drm_string + additional_data += ebx_elem additional_data += "\r" additional_data += "endobj" @@ -117,36 +226,40 @@ def patch_drm_into_pdf(filename_in, drm_string, filename_out): additional_data += "trailer" additional_data += "\r" - arr_root_str = root_str.split('/') + arr_root_str = encrypt.split('/') did_prev = False for elem in arr_root_str: if elem.startswith("Prev"): did_prev = True - additional_data += "Prev " + startxref + additional_data += "Prev " + str(startxref_offset) #print("Replacing prev from '%s' to '%s'" % (elem, "Prev " + startxref)) - elif elem.startswith("ID[<"): - additional_data += elem.replace("><", "> <") else: - additional_data += elem + additional_data += cleanup_encrypt_element(elem) additional_data += "/" if not did_prev: # remove two >> at end additional_data = additional_data[:-3] - additional_data += "/Prev " + startxref + ">>" + "/" + additional_data += "/Prev " + str(startxref_offset) + ">>" + "/" #print("Faking Prev %s" % startxref) additional_data = additional_data[:-1] additional_data += "\r" + "startxref\r" + str(ptr) + "\r" + "%%EOF" - print("Appending DRM data: %s" % (additional_data)) + #print("Appending DRM data: %s" % (additional_data)) - inp = open(ORIG_FILE, "rb") + inp = open(filename_in, "rb") out = open(filename_out, "wb") out.write(inp.read()) out.write(additional_data.encode("latin-1")) inp.close() out.close() + + drm_end_time = int(time.time() * 1000) + + print("Whole DRM patching took %d milliseconds." % (drm_end_time - drm_start_time)) + + return True \ No newline at end of file