From 36b28765dd9af34f1be3c639c782d32a1ea4911e Mon Sep 17 00:00:00 2001
From: Florian Bach <Leseratte10@vodafone.de>
Date: Sun, 3 Oct 2021 10:30:45 +0200
Subject: [PATCH] v0.0.8: PDF fixes & ACSM file type config

---
 calibre-plugin/__init__.py |  78 +++++++----
 calibre-plugin/fulfill.py  |  52 ++++---
 calibre-plugin/libadobe.py |  34 +++++
 calibre-plugin/libpdf.py   | 277 ++++++++++++++++++++++++++-----------
 4 files changed, 309 insertions(+), 132 deletions(-)

diff --git a/calibre-plugin/__init__.py b/calibre-plugin/__init__.py
index c7fb96c..75e88cb 100644
--- a/calibre-plugin/__init__.py
+++ b/calibre-plugin/__init__.py
@@ -12,10 +12,11 @@
 # v0.0.5: Bugfix: DeDRM plugin was also executed if it's installed but disabled.
 # v0.0.6: First PDF support, allow importing previously exported activation data.
 # v0.0.7: More PDF logging, PDF reading in latin-1, MacOS locale bugfix
+# v0.0.8: More PDF bugfixes, support unlimited PDF file sizes, tell Calibre ACSMs are books.
 
 
 from calibre.customize import FileTypePlugin        # type: ignore
-__version__ = '0.0.7'
+__version__ = '0.0.8'
 
 PLUGIN_NAME = "DeACSM"
 PLUGIN_VERSION_TUPLE = tuple([int(x) for x in __version__.split(".")])
@@ -24,7 +25,7 @@ PLUGIN_VERSION = ".".join([str(x)for x in PLUGIN_VERSION_TUPLE])
 
 from calibre.utils.config import config_dir         # type: ignore
 
-import os, shutil, traceback, sys
+import os, shutil, traceback, sys, time
 import zipfile
 from lxml import etree
 
@@ -49,6 +50,18 @@ class DeACSM(FileTypePlugin):
         """
 
         try:
+
+            # Patch Calibre to consider "ACSM" a book. This makes ACSM files show up
+            # in the "Add Book" file selection, and it also makes the auto-add feature useable.
+            try: 
+                from calibre.ebooks import BOOK_EXTENSIONS
+                if ("acsm" not in BOOK_EXTENSIONS):
+                    BOOK_EXTENSIONS.append("acsm")
+            except:
+                print("{0} v{1}: Couldn't add ACSM to book extension list:".format(PLUGIN_NAME, PLUGIN_VERSION))
+                traceback.print_exc()
+
+
             self.pluginsdir = os.path.join(config_dir,"plugins")
             if not os.path.exists(self.pluginsdir):
                 os.mkdir(self.pluginsdir)
@@ -167,21 +180,21 @@ class DeACSM(FileTypePlugin):
 
 
         try: 
-            from calibre_plugins.deacsm.libadobe import sendHTTPRequest
+            from calibre_plugins.deacsm.libadobe import sendHTTPRequest_DL2FILE
             from calibre_plugins.deacsm.libadobeFulfill import buildRights, fulfill
         except: 
             try: 
-                from libadobe import sendHTTPRequest
+                from libadobe import sendHTTPRequest_DL2FILE
                 from libadobeFulfill import buildRights, fulfill
             except: 
                 print("{0} v{1}: Error while importing Fulfillment stuff".format(PLUGIN_NAME, PLUGIN_VERSION))
                 traceback.print_exc()
 
         try:
-            from calibre_plugins.deacsm.libpdf import patch_drm_into_pdf, prepare_string_from_xml
+            from calibre_plugins.deacsm.libpdf import patch_drm_into_pdf
         except: 
             try: 
-                from libpdf import patch_drm_into_pdf, prepare_string_from_xml
+                from libpdf import patch_drm_into_pdf
             except: 
                 print("{0} v{1}: Error while importing PDF patch".format(PLUGIN_NAME, PLUGIN_VERSION))
                 traceback.print_exc()
@@ -190,9 +203,7 @@ class DeACSM(FileTypePlugin):
         adobe_fulfill_response = etree.fromstring(replyData)
         NSMAP = { "adept" : "http://ns.adobe.com/adept" }
         adNS = lambda tag: '{%s}%s' % ('http://ns.adobe.com/adept', tag)
-        adDC = lambda tag: '{%s}%s' % ('http://purl.org/dc/elements/1.1/', tag)
 
-        metadata_node = adobe_fulfill_response.find("./%s/%s/%s" % (adNS("fulfillmentResult"), adNS("resourceItemInfo"), adNS("metadata")))
         download_url = adobe_fulfill_response.find("./%s/%s/%s" % (adNS("fulfillmentResult"), adNS("resourceItemInfo"), adNS("src"))).text
         license_token_node = adobe_fulfill_response.find("./%s/%s/%s" % (adNS("fulfillmentResult"), adNS("resourceItemInfo"), adNS("licenseToken")))
 
@@ -205,9 +216,25 @@ class DeACSM(FileTypePlugin):
         # Download eBook: 
         print("{0} v{1}: Loading book from {2}".format(PLUGIN_NAME, PLUGIN_VERSION, download_url))
 
-        book_content = sendHTTPRequest(download_url)
+        filename_tmp = self.temporary_file(".blob").name
+
+        dl_start_time = int(time.time() * 1000)
+        ret = sendHTTPRequest_DL2FILE(download_url, filename_tmp)
+        dl_end_time = int(time.time() * 1000)
+
+        print("Download took %d ms (HTTP %d)" % (dl_end_time - dl_start_time, ret))
+
+        if (ret != 200):
+            print("{0} v{1}: Download failed with error {2}".format(PLUGIN_NAME, PLUGIN_VERSION, ret))
+            return None            
+
         filetype = ".bin"
-        
+
+        book_content = None
+
+        with open(filename_tmp, "rb") as f:
+            book_content = f.read(10)
+                
         if (book_content.startswith(b"PK")):
             print("That's a ZIP file -> EPUB")
             filetype = ".epub"
@@ -217,23 +244,9 @@ class DeACSM(FileTypePlugin):
 
         filename = self.temporary_file(filetype).name
 
-        author = "None"
-        title = "None"
-        
-        try: 
-            title = metadata_node.find("./%s" % (adDC("title"))).text
-            author = metadata_node.find("./%s" % (adDC("creator"))).text
+        # Move to file name with matching extension
+        shutil.move(filename_tmp, filename)
 
-            title = title.replace("(", "").replace(")", "").replace("/", "")
-            author = author.replace("(", "").replace(")", "").replace("/", "")
-
-        except:
-            pass
-
-        # Store book:
-        f = open(filename, "wb")
-        f.write(book_content)
-        f.close()
 
         if filetype == ".epub":
             # Store EPUB rights / encryption stuff
@@ -244,10 +257,19 @@ class DeACSM(FileTypePlugin):
             return filename
 
         elif filetype == ".pdf":
+            adobe_fulfill_response = etree.fromstring(rights_xml_str)
+            NSMAP = { "adept" : "http://ns.adobe.com/adept" }
+            adNS = lambda tag: '{%s}%s' % ('http://ns.adobe.com/adept', tag)
+            resource = adobe_fulfill_response.find("./%s/%s" % (adNS("licenseToken"), adNS("resource"))).text
+
             print("{0} v{1}: Downloaded PDF, adding encryption config ...".format(PLUGIN_NAME, PLUGIN_VERSION))
             pdf_tmp_file = self.temporary_file(filetype).name
-            patch_drm_into_pdf(filename, prepare_string_from_xml(rights_xml_str, title, author), pdf_tmp_file)
-            print("{0} v{1}: File successfully fulfilled ...".format(PLUGIN_NAME, PLUGIN_VERSION))
+            ret = patch_drm_into_pdf(filename, rights_xml_str, pdf_tmp_file, resource)
+            if (ret):
+                print("{0} v{1}: File successfully fulfilled ...".format(PLUGIN_NAME, PLUGIN_VERSION))
+            else:
+                print("{0} v{1}: There was an error patching the PDF file.".format(PLUGIN_NAME, PLUGIN_VERSION))
+
             return pdf_tmp_file
         else: 
             print("{0} v{1}: Error: Unsupported file type ...".format(PLUGIN_NAME, PLUGIN_VERSION))
diff --git a/calibre-plugin/fulfill.py b/calibre-plugin/fulfill.py
index 34fa82b..6b0e4ee 100644
--- a/calibre-plugin/fulfill.py
+++ b/calibre-plugin/fulfill.py
@@ -7,7 +7,7 @@ This is an experimental Python version of libgourou.
 
 # pyright: reportUndefinedVariable=false
 
-import sys, os
+import sys, os, time, shutil
 if sys.version_info[0] < 3:
     print("This script requires Python 3.")
     exit(1)
@@ -15,9 +15,9 @@ if sys.version_info[0] < 3:
 import zipfile
 from lxml import etree
 
-from libadobe import sendHTTPRequest
+from libadobe import sendHTTPRequest_DL2FILE
 from libadobeFulfill import buildRights, fulfill
-from libpdf import patch_drm_into_pdf, prepare_string_from_xml
+from libpdf import patch_drm_into_pdf
 
 FILE_DEVICEKEY = "devicesalt"
 FILE_DEVICEXML = "device.xml"
@@ -48,28 +48,31 @@ def download(replyData):
         exit(1)
 
     book_name = None
-    author = "None"
-    title = "None"
+
     try: 
         book_name = metadata_node.find("./%s" % (adDC("title"))).text
     except: 
         book_name = "Book"
     
-    try: 
-        title = metadata_node.find("./%s" % (adDC("title"))).text
-        author = metadata_node.find("./%s" % (adDC("creator"))).text
-
-        title = title.replace("(", "").replace(")", "").replace("/", "")
-        author = author.replace("(", "").replace(")", "").replace("/", "")
-
-    except:
-        pass
 
     # Download eBook: 
 
     print(download_url)
 
-    book_content = sendHTTPRequest(download_url)
+    filename_tmp = book_name + ".tmp"
+
+    dl_start_time = int(time.time() * 1000)
+    ret = sendHTTPRequest_DL2FILE(download_url, filename_tmp)
+    dl_end_time = int(time.time() * 1000)
+    print("Download took %d milliseconds" % (dl_end_time - dl_start_time))
+
+    if (ret != 200):
+        print("Download failed with error %d" % (ret))
+        exit()
+
+    with open(filename_tmp, "rb") as f:
+        book_content = f.read(10)
+
     filetype = ".bin"
     
     if (book_content.startswith(b"PK")):
@@ -80,11 +83,7 @@ def download(replyData):
         filetype = ".pdf"
 
     filename = book_name + filetype
-
-    # Store book:
-    f = open(filename, "wb")
-    f.write(book_content)
-    f.close()
+    shutil.move(filename_tmp, filename)
 
     if filetype == ".epub":
         # Store EPUB rights / encryption stuff
@@ -97,11 +96,20 @@ def download(replyData):
     
     elif filetype == ".pdf":
         print("Successfully downloaded PDF, patching encryption ...")
+
+        adobe_fulfill_response = etree.fromstring(rights_xml_str)
+        NSMAP = { "adept" : "http://ns.adobe.com/adept" }
+        adNS = lambda tag: '{%s}%s' % ('http://ns.adobe.com/adept', tag)
+        resource = adobe_fulfill_response.find("./%s/%s" % (adNS("licenseToken"), adNS("resource"))).text
         
         os.rename(filename, "tmp_" + filename)
-        patch_drm_into_pdf("tmp_" + filename, prepare_string_from_xml(rights_xml_str, author, title), filename)
+        ret = patch_drm_into_pdf("tmp_" + filename, rights_xml_str, filename, resource)
         os.remove("tmp_" + filename)
-        print("File successfully fulfilled to " + filename)
+        if (ret):
+            print("File successfully fulfilled to " + filename)
+        else: 
+            print("Errors occurred while patching " + filename)
+            exit(1)
         exit(0)
     else: 
         print("Error: Weird filetype")
diff --git a/calibre-plugin/libadobe.py b/calibre-plugin/libadobe.py
index 4f75912..5736827 100644
--- a/calibre-plugin/libadobe.py
+++ b/calibre-plugin/libadobe.py
@@ -121,6 +121,40 @@ def makeFingerprint(serial: str):
 
 ############################################## HTTP stuff:
 
+def sendHTTPRequest_DL2FILE(URL: str, outputfile: str):
+    headers = {
+        "Accept": "*/*",
+        "User-Agent": "book2png",
+    }
+    req = urllib.request.Request(url=URL, headers=headers)
+    handler = urllib.request.urlopen(req)
+
+    chunksize = 16 * 1024
+
+    ret_code = handler.getcode()
+
+
+    loc = None
+    try: 
+        loc = req.headers.get("Location")
+    except:
+        pass
+
+    if loc is not None: 
+        return sendHTTPRequest_DL2FILE(loc)
+
+    if ret_code != 200:
+        return ret_code
+
+    with open(outputfile, "wb") as f:
+        while True: 
+            chunk = handler.read(chunksize)
+            if not chunk: 
+                break
+            f.write(chunk)
+
+    return 200
+
 def sendHTTPRequest_getSimple(URL: str):
 
     headers = {
diff --git a/calibre-plugin/libpdf.py b/calibre-plugin/libpdf.py
index 82721af..fbb8c49 100644
--- a/calibre-plugin/libpdf.py
+++ b/calibre-plugin/libpdf.py
@@ -1,112 +1,221 @@
-import os, zlib, base64
-from lxml import etree
+import os, zlib, base64, time
 
+class BackwardReader:
 
-def read_reverse_order(file_name):
-    # Open file for reading in binary mode
-    with open(file_name, 'rb') as read_obj:
-        # Move the cursor to the end of the file
-        read_obj.seek(0, os.SEEK_END)
-        # Get the current position of pointer i.e eof
-        pointer_location = read_obj.tell()
-        # Create a buffer to keep the last read line
+    def __init__(self, file):
+        self.file = file
+
+    def readlines(self):
+        BLKSIZE = 4096
+        # Move reader to the end of file
+        self.file.seek(0, os.SEEK_END)
         buffer = bytearray()
-        # Loop till pointer reaches the top of the file
-        while pointer_location >= 0:
-            # Move the file pointer to the location pointed by pointer_location
-            read_obj.seek(pointer_location)
-            # Shift pointer location by -1
-            pointer_location = pointer_location -1
-            # read that byte / character
-            new_byte = read_obj.read(1)
-            # If the read byte is new line character then it means one line is read
-            if new_byte == b'\n':
-                # Fetch the line from buffer and yield it
-                yield buffer.decode("latin-1")[::-1]
-                # Reinitialize the byte array to save next line
-                buffer = bytearray()
+
+        while True:
+            pos_newline = buffer.rfind(bytes([0x0a]))
+            # Get the current position of the reader
+            current_pos = self.file.tell()
+            if pos_newline != -1:
+                # Newline is found
+                line = buffer[pos_newline+1:]
+                buffer = buffer[:pos_newline]
+                yield line.decode("latin-1")
+            elif current_pos:
+                # Need to fill the buffer
+                to_read = min(BLKSIZE, current_pos)
+                self.file.seek(current_pos-to_read, 0)
+                buffer = self.file.read(to_read) + buffer
+                self.file.seek(current_pos-to_read, 0)
+                if current_pos is to_read:
+                    buffer = bytes([0x0a]) + buffer
             else:
-                # If last read character is not eol then add it in buffer
-                buffer.extend(new_byte)
-        # As file is read completely, if there is still data in buffer, then its the first line.
-        if len(buffer) > 0:
-            # Yield the first line too
-            yield buffer.decode("latin-1")[::-1]
+                # Start of file
+                return
+
+
+
+def trim_encrypt_string(encrypt):
+
+    string_list = list(encrypt)
+    strlen = len(encrypt)
+
+    i = 0
+    bracket_count = 0
+    while (i < strlen):
+        if string_list[i] == "<" and string_list[i+1] == "<":
+            bracket_count += 1
+
+        if string_list[i] == ">" and string_list[i+1] == ">":
+            bracket_count -= 1
+
+        if bracket_count == 0: 
+            break
+
+        i = i + 1
+
+    len_to_use = i+2
+
+    return encrypt[0:len_to_use]
+
+def cleanup_encrypt_element(element):
+
+    if element.startswith("ID[<"):
+        element = element.replace("><", "> <")
+
+    element = ' '.join(element.split())
+    element = element.replace("[ ", "[").replace("] ", "]")
+
+    return element
+
+
+
 
 def deflate_and_base64_encode( string_val ):
     zlibbed_str = zlib.compress( string_val )
     compressed_string = zlibbed_str[2:-4]
     return base64.b64encode( compressed_string )
 
-def prepare_string_from_xml(xmlstring, title, author):
-    b64data = deflate_and_base64_encode(xmlstring.encode("utf-8")).decode("utf-8")
+def update_ebx_with_keys(ebx_data, adept_license, ebx_bookid):
 
-    adobe_fulfill_response = etree.fromstring(xmlstring)
-    NSMAP = { "adept" : "http://ns.adobe.com/adept" }
-    adNS = lambda tag: '{%s}%s' % ('http://ns.adobe.com/adept', tag)
-    resource = adobe_fulfill_response.find("./%s/%s" % (adNS("licenseToken"), adNS("resource"))).text
+    b64data = deflate_and_base64_encode(adept_license.encode("utf-8")).decode("utf-8")
 
-    return "<</Length 128/EBX_TITLE(%s)/Filter/EBX_HANDLER/EBX_AUTHOR(%s)/V 4/ADEPT_ID(%s)/EBX_BOOKID(%s)/ADEPT_LICENSE(%s)>>" % (title, author, resource, resource, b64data)
+    ebx_new = ebx_data[:-2]
+    ebx_new += "/EBX_BOOKID(%s)/ADEPT_LICENSE(%s)>>" % (ebx_bookid, b64data)
 
-def patch_drm_into_pdf(filename_in, drm_string, filename_out):
+    return ebx_new
 
-    ORIG_FILE = filename_in
+
+def find_ebx(filename_in):
+    find_ebx_start = int(time.time() * 1000)
+    i = 0
+
+    fl = open(filename_in, "rb")
+    br = BackwardReader(fl)
+
+    for line in br.readlines():
+        i = i + 1
+        if "/EBX_HANDLER/" in line:
+            find_ebx_end = int(time.time() * 1000)
+            print("Found EBX after %d attempts - took %d ms" % (i, find_ebx_end - find_ebx_start))
+            return line
+
+    find_ebx_end = int(time.time() * 1000)
+    print("Error: Did not find EBX_HANDLER - took %d ms" % (find_ebx_end - find_ebx_start))
+    return None
+
+def find_enc(filename_in):
+    find_enc_start = int(time.time() * 1000)
+    i = 0
+
+    fl = open(filename_in, "rb")
+    br = BackwardReader(fl)
+
+    for line in br.readlines():
+        i = i + 1
+        if "R/Encrypt" in line and "R/ID" in line:
+            find_enc_end = int(time.time() * 1000)
+            print("Found ENC after %d attempts - took %d ms" % (i, find_enc_end - find_enc_start))
+            return line
+    
+    find_enc_end = int(time.time() * 1000)
+    print("Error: Did not find ENC - took %d ms" % (find_enc_end - find_enc_start))
+    return None
+
+
+
+def patch_drm_into_pdf(filename_in, adept_license_string, filename_out, ebx_bookid):
+
+    drm_start_time = int(time.time() * 1000)
 
     trailer = ""
     trailer_idx = 0
 
-    print("DRM data is %s" % (drm_string))
+    startxref_offset = 0
+    prevline = ""
 
-    for line in read_reverse_order(ORIG_FILE):
+
+    fl = open(filename_in, "rb")
+    br = BackwardReader(fl)
+
+    print("Searching for startxref ...")
+    for line in br.readlines():
         trailer_idx += 1
         trailer = line + "\n" + trailer
-        print("DEBUG: pdfdata[%d] = %s" % (trailer_idx, line))
-        if (trailer_idx == 20):
-            print("trailer_idx is very large (%d). Usually it's 10 or less. File might be corrupted." % trailer_idx)
-        if (line == "trailer"): 
-            print("Found trailer at idx %d" % (trailer_idx))
+
+        print ("LINE: " + line)
+
+        if (trailer_idx > 10):
+            print("Took more than 10 attempts to find startxref ...")
+            return False
+        
+        if (line == "startxref"):
+            startxref_offset = int(prevline)
+            print("Got startxref: %d" % (startxref_offset))            
             break
+        prevline = line
+
+
 
     r_encrypt_offs1 = 0
     r_encrypt_offs2 = 0
-    root_str = None
-    next_startxref = False
-    startxref = None
 
-    for line in trailer.split('\n'):
-        #print(line)
-        if ("R/Encrypt" in line):
-            root_str = line
-            line_split = line.split(' ')
+    encrypt = None
+
+
+    encrypt = find_enc(filename_in)
+    if encrypt is None:
+        print("Error, enc not found")
+        return False
+
+    line_split = encrypt.split(' ')
+    next = 0
+    for element in line_split:
+        if element == "R/Encrypt":
+            next = 2
+            continue
+        if next == 2:
+            r_encrypt_offs1 = element
+            next = 1
+            continue
+        if next == 1: 
+            r_encrypt_offs2 = element
             next = 0
-            for element in line_split:
-                if element == "R/Encrypt":
-                    next = 2
-                    continue
-                if next == 2:
-                    r_encrypt_offs1 = element
-                    next = 1
-                    continue
-                if next == 1: 
-                    r_encrypt_offs2 = element
-                    next = 0
-                    continue
-        if "startxref" in line: 
-            next_startxref = True
-            continue
-        if next_startxref:
-            startxref = line
-            next_startxref = False
             continue
+
+
+    # read EBX element:
+    ebx_elem = find_ebx(filename_in)
+    
+    if (ebx_elem is None):
+        print("Err: EBX is None")
+        return False
+
+    
+    print("")
+    print("")
+    print("Encryption handler:")
+    print(encrypt)
+    print("EBX handler:")
+    print(ebx_elem)
+
+    encrypt = trim_encrypt_string(encrypt)
+
+    print("Trimmed encryption handler:")
+    print(encrypt)
+
+    ebx_elem = update_ebx_with_keys(ebx_elem, adept_license_string, ebx_bookid)
+
+    print("Updated EBX handler not logged due to sensitive data")
+    #print(ebx_elem)
         
 
-    filesize_str = str(os.path.getsize(ORIG_FILE))
+    filesize_str = str(os.path.getsize(filename_in))
     filesize_pad = filesize_str.zfill(10)
 
 
     additional_data = "\r"
     additional_data += r_encrypt_offs1 + " " + r_encrypt_offs2 + " " + "obj" + "\r"
-    additional_data += drm_string
+    additional_data += ebx_elem
     additional_data += "\r"
     additional_data += "endobj"
 
@@ -117,36 +226,40 @@ def patch_drm_into_pdf(filename_in, drm_string, filename_out):
     additional_data += "trailer"
     additional_data += "\r"
 
-    arr_root_str = root_str.split('/')
+    arr_root_str = encrypt.split('/')
     did_prev = False
     for elem in arr_root_str: 
         if elem.startswith("Prev"):
             did_prev = True
-            additional_data += "Prev " + startxref
+            additional_data += "Prev " + str(startxref_offset)
             #print("Replacing prev from '%s' to '%s'" % (elem, "Prev " + startxref))
-        elif elem.startswith("ID[<"):
-            additional_data += elem.replace("><", "> <")
         else:
-            additional_data += elem
+            additional_data += cleanup_encrypt_element(elem)
         additional_data += "/"
 
     if not did_prev:
         # remove two >> at end
         additional_data = additional_data[:-3]
-        additional_data += "/Prev " + startxref + ">>" + "/"
+        additional_data += "/Prev " + str(startxref_offset) + ">>" + "/"
         #print("Faking Prev %s" % startxref)
 
     additional_data = additional_data[:-1]
 
     additional_data += "\r" + "startxref\r" + str(ptr) + "\r" + "%%EOF"
 
-    print("Appending DRM data: %s" % (additional_data))
+    #print("Appending DRM data: %s" % (additional_data))
 
 
-    inp = open(ORIG_FILE, "rb")
+    inp = open(filename_in, "rb")
 
     out = open(filename_out, "wb")
     out.write(inp.read())
     out.write(additional_data.encode("latin-1"))
     inp.close()
     out.close()
+
+    drm_end_time = int(time.time() * 1000)
+
+    print("Whole DRM patching took %d milliseconds." % (drm_end_time - drm_start_time))
+
+    return True
\ No newline at end of file