v0.0.8: PDF fixes & ACSM file type config

2024-12-22 17:29:56 +06:00 · 2021-10-03 10:30:45 +02:00 · 2021-10-03 10:30:45 +02:00 · 36b28765dd
commit 36b28765dd
parent 61bebdd906
4 changed files with 309 additions and 132 deletions
--- a/calibre-plugin/init.py
+++ b/calibre-plugin/init.py
@ -12,10 +12,11 @@
 # v0.0.5: Bugfix: DeDRM plugin was also executed if it's installed but disabled.
 # v0.0.6: First PDF support, allow importing previously exported activation data.
 # v0.0.7: More PDF logging, PDF reading in latin-1, MacOS locale bugfix
+# v0.0.8: More PDF bugfixes, support unlimited PDF file sizes, tell Calibre ACSMs are books.


 from calibre.customize import FileTypePlugin        # type: ignore
-__version__ = '0.0.7'
+__version__ = '0.0.8'

 PLUGIN_NAME = "DeACSM"
 PLUGIN_VERSION_TUPLE = tuple([int(x) for x in __version__.split(".")])
@ -24,7 +25,7 @@ PLUGIN_VERSION = ".".join([str(x)for x in PLUGIN_VERSION_TUPLE])

 from calibre.utils.config import config_dir         # type: ignore

-import os, shutil, traceback, sys
+import os, shutil, traceback, sys, time
 import zipfile
 from lxml import etree

@ -49,6 +50,18 @@ class DeACSM(FileTypePlugin):
        """

        try:
+
+            # Patch Calibre to consider "ACSM" a book. This makes ACSM files show up
+            # in the "Add Book" file selection, and it also makes the auto-add feature useable.
+            try: 
+                from calibre.ebooks import BOOK_EXTENSIONS
+                if ("acsm" not in BOOK_EXTENSIONS):
+                    BOOK_EXTENSIONS.append("acsm")
+            except:
+                print("{0} v{1}: Couldn't add ACSM to book extension list:".format(PLUGIN_NAME, PLUGIN_VERSION))
+                traceback.print_exc()
+
+
            self.pluginsdir = os.path.join(config_dir,"plugins")
            if not os.path.exists(self.pluginsdir):
                os.mkdir(self.pluginsdir)
@ -167,21 +180,21 @@ class DeACSM(FileTypePlugin):


        try: 
-            from calibre_plugins.deacsm.libadobe import sendHTTPRequest
+            from calibre_plugins.deacsm.libadobe import sendHTTPRequest_DL2FILE
            from calibre_plugins.deacsm.libadobeFulfill import buildRights, fulfill
        except: 
            try: 
-                from libadobe import sendHTTPRequest
+                from libadobe import sendHTTPRequest_DL2FILE
                from libadobeFulfill import buildRights, fulfill
            except: 
                print("{0} v{1}: Error while importing Fulfillment stuff".format(PLUGIN_NAME, PLUGIN_VERSION))
                traceback.print_exc()

        try:
-            from calibre_plugins.deacsm.libpdf import patch_drm_into_pdf, prepare_string_from_xml
+            from calibre_plugins.deacsm.libpdf import patch_drm_into_pdf
        except: 
            try: 
-                from libpdf import patch_drm_into_pdf, prepare_string_from_xml
+                from libpdf import patch_drm_into_pdf
            except: 
                print("{0} v{1}: Error while importing PDF patch".format(PLUGIN_NAME, PLUGIN_VERSION))
                traceback.print_exc()
@ -190,9 +203,7 @@ class DeACSM(FileTypePlugin):
        adobe_fulfill_response = etree.fromstring(replyData)
        NSMAP = { "adept" : "http://ns.adobe.com/adept" }
        adNS = lambda tag: '{%s}%s' % ('http://ns.adobe.com/adept', tag)
-        adDC = lambda tag: '{%s}%s' % ('http://purl.org/dc/elements/1.1/', tag)

-        metadata_node = adobe_fulfill_response.find("./%s/%s/%s" % (adNS("fulfillmentResult"), adNS("resourceItemInfo"), adNS("metadata")))
        download_url = adobe_fulfill_response.find("./%s/%s/%s" % (adNS("fulfillmentResult"), adNS("resourceItemInfo"), adNS("src"))).text
        license_token_node = adobe_fulfill_response.find("./%s/%s/%s" % (adNS("fulfillmentResult"), adNS("resourceItemInfo"), adNS("licenseToken")))

@ -205,9 +216,25 @@ class DeACSM(FileTypePlugin):
        # Download eBook: 
        print("{0} v{1}: Loading book from {2}".format(PLUGIN_NAME, PLUGIN_VERSION, download_url))

-        book_content = sendHTTPRequest(download_url)
+        filename_tmp = self.temporary_file(".blob").name
+
+        dl_start_time = int(time.time() * 1000)
+        ret = sendHTTPRequest_DL2FILE(download_url, filename_tmp)
+        dl_end_time = int(time.time() * 1000)
+
+        print("Download took %d ms (HTTP %d)" % (dl_end_time - dl_start_time, ret))
+
+        if (ret != 200):
+            print("{0} v{1}: Download failed with error {2}".format(PLUGIN_NAME, PLUGIN_VERSION, ret))
+            return None            
+
        filetype = ".bin"

+        book_content = None
+
+        with open(filename_tmp, "rb") as f:
+            book_content = f.read(10)
+                
        if (book_content.startswith(b"PK")):
            print("That's a ZIP file -> EPUB")
            filetype = ".epub"
@ -217,23 +244,9 @@ class DeACSM(FileTypePlugin):

        filename = self.temporary_file(filetype).name

-        author = "None"
-        title = "None"
+        # Move to file name with matching extension
+        shutil.move(filename_tmp, filename)

-        try: 
-            title = metadata_node.find("./%s" % (adDC("title"))).text
-            author = metadata_node.find("./%s" % (adDC("creator"))).text
-
-            title = title.replace("(", "").replace(")", "").replace("/", "")
-            author = author.replace("(", "").replace(")", "").replace("/", "")
-
-        except:
-            pass
-
-        # Store book:
-        f = open(filename, "wb")
-        f.write(book_content)
-        f.close()

        if filetype == ".epub":
            # Store EPUB rights / encryption stuff
@ -244,10 +257,19 @@ class DeACSM(FileTypePlugin):
            return filename

        elif filetype == ".pdf":
+            adobe_fulfill_response = etree.fromstring(rights_xml_str)
+            NSMAP = { "adept" : "http://ns.adobe.com/adept" }
+            adNS = lambda tag: '{%s}%s' % ('http://ns.adobe.com/adept', tag)
+            resource = adobe_fulfill_response.find("./%s/%s" % (adNS("licenseToken"), adNS("resource"))).text
+
            print("{0} v{1}: Downloaded PDF, adding encryption config ...".format(PLUGIN_NAME, PLUGIN_VERSION))
            pdf_tmp_file = self.temporary_file(filetype).name
-            patch_drm_into_pdf(filename, prepare_string_from_xml(rights_xml_str, title, author), pdf_tmp_file)
-            print("{0} v{1}: File successfully fulfilled ...".format(PLUGIN_NAME, PLUGIN_VERSION))
+            ret = patch_drm_into_pdf(filename, rights_xml_str, pdf_tmp_file, resource)
+            if (ret):
+                print("{0} v{1}: File successfully fulfilled ...".format(PLUGIN_NAME, PLUGIN_VERSION))
+            else:
+                print("{0} v{1}: There was an error patching the PDF file.".format(PLUGIN_NAME, PLUGIN_VERSION))
+
            return pdf_tmp_file
        else: 
            print("{0} v{1}: Error: Unsupported file type ...".format(PLUGIN_NAME, PLUGIN_VERSION))
--- a/calibre-plugin/fulfill.py
+++ b/calibre-plugin/fulfill.py
@ -7,7 +7,7 @@ This is an experimental Python version of libgourou.

 # pyright: reportUndefinedVariable=false

-import sys, os
+import sys, os, time, shutil
 if sys.version_info[0] < 3:
    print("This script requires Python 3.")
    exit(1)
@ -15,9 +15,9 @@ if sys.version_info[0] < 3:
 import zipfile
 from lxml import etree

-from libadobe import sendHTTPRequest
+from libadobe import sendHTTPRequest_DL2FILE
 from libadobeFulfill import buildRights, fulfill
-from libpdf import patch_drm_into_pdf, prepare_string_from_xml
+from libpdf import patch_drm_into_pdf

 FILE_DEVICEKEY = "devicesalt"
 FILE_DEVICEXML = "device.xml"
@ -48,28 +48,31 @@ def download(replyData):
        exit(1)

    book_name = None
-    author = "None"
-    title = "None"
+
    try: 
        book_name = metadata_node.find("./%s" % (adDC("title"))).text
    except: 
        book_name = "Book"
    
-    try: 
-        title = metadata_node.find("./%s" % (adDC("title"))).text
-        author = metadata_node.find("./%s" % (adDC("creator"))).text
-
-        title = title.replace("(", "").replace(")", "").replace("/", "")
-        author = author.replace("(", "").replace(")", "").replace("/", "")
-
-    except:
-        pass

    # Download eBook: 

    print(download_url)

-    book_content = sendHTTPRequest(download_url)
+    filename_tmp = book_name + ".tmp"
+
+    dl_start_time = int(time.time() * 1000)
+    ret = sendHTTPRequest_DL2FILE(download_url, filename_tmp)
+    dl_end_time = int(time.time() * 1000)
+    print("Download took %d milliseconds" % (dl_end_time - dl_start_time))
+
+    if (ret != 200):
+        print("Download failed with error %d" % (ret))
+        exit()
+
+    with open(filename_tmp, "rb") as f:
+        book_content = f.read(10)
+
    filetype = ".bin"
    
    if (book_content.startswith(b"PK")):
@ -80,11 +83,7 @@ def download(replyData):
        filetype = ".pdf"

    filename = book_name + filetype
-
-    # Store book:
-    f = open(filename, "wb")
-    f.write(book_content)
-    f.close()
+    shutil.move(filename_tmp, filename)

    if filetype == ".epub":
        # Store EPUB rights / encryption stuff
@ -98,10 +97,19 @@ def download(replyData):
    elif filetype == ".pdf":
        print("Successfully downloaded PDF, patching encryption ...")

+        adobe_fulfill_response = etree.fromstring(rights_xml_str)
+        NSMAP = { "adept" : "http://ns.adobe.com/adept" }
+        adNS = lambda tag: '{%s}%s' % ('http://ns.adobe.com/adept', tag)
+        resource = adobe_fulfill_response.find("./%s/%s" % (adNS("licenseToken"), adNS("resource"))).text
+        
        os.rename(filename, "tmp_" + filename)
-        patch_drm_into_pdf("tmp_" + filename, prepare_string_from_xml(rights_xml_str, author, title), filename)
+        ret = patch_drm_into_pdf("tmp_" + filename, rights_xml_str, filename, resource)
        os.remove("tmp_" + filename)
-        print("File successfully fulfilled to " + filename)
+        if (ret):
+            print("File successfully fulfilled to " + filename)
+        else: 
+            print("Errors occurred while patching " + filename)
+            exit(1)
        exit(0)
    else: 
        print("Error: Weird filetype")
--- a/calibre-plugin/libadobe.py
+++ b/calibre-plugin/libadobe.py
@ -121,6 +121,40 @@ def makeFingerprint(serial: str):

 ############################################## HTTP stuff:

+def sendHTTPRequest_DL2FILE(URL: str, outputfile: str):
+    """Download ``URL`` into ``outputfile`` in 16 KiB chunks.
+
+    Returns the HTTP status code: 200 once the whole body has been written,
+    otherwise the non-200 code reported by the server (nothing is written in
+    that case). Network-level failures such as ``urllib.error.URLError``
+    still propagate to the caller.
+    """
+    import urllib.error
+    import urllib.parse
+    import urllib.request
+
+    headers = {
+        "Accept": "*/*",
+        "User-Agent": "book2png",
+    }
+    chunksize = 16 * 1024
+
+    # urlopen() follows the common redirects by itself. This bound only
+    # applies to 3xx answers it hands back as HTTPError, so a redirect loop
+    # cannot keep us here forever.
+    max_manual_redirects = 5
+
+    for _ in range(max_manual_redirects + 1):
+        req = urllib.request.Request(url=URL, headers=headers)
+        try:
+            handler = urllib.request.urlopen(req)
+        except urllib.error.HTTPError as err:
+            # Callers compare the return value against 200, so report the
+            # status instead of letting the exception escape.
+            ret_code = err.code
+            # The redirect target lives in the *response* headers.
+            loc = err.headers.get("Location") if err.headers is not None else None
+            err.close()
+            if loc is None or not 300 <= ret_code < 400:
+                return ret_code
+            # Location may be relative; resolve it against the current URL.
+            URL = urllib.parse.urljoin(URL, loc)
+            continue
+
+        # Close the response on every exit path.
+        with handler:
+            ret_code = handler.getcode()
+            if ret_code != 200:
+                return ret_code
+
+            with open(outputfile, "wb") as f:
+                while True:
+                    chunk = handler.read(chunksize)
+                    if not chunk:
+                        break
+                    f.write(chunk)
+
+        return 200
+
+    # Too many manual redirects: report the last 3xx status.
+    return ret_code
+
 def sendHTTPRequest_getSimple(URL: str):

    headers = {
--- a/calibre-plugin/libpdf.py
+++ b/calibre-plugin/libpdf.py
@ -1,112 +1,221 @@
-import os, zlib, base64
-from lxml import etree
+import os, zlib, base64, time

+class BackwardReader:
+    """Iterate over the lines of a binary file from the last line to the first.
+
+    ``file`` must be a seekable file object opened in binary mode; it is not
+    closed by this class.
+    """

-def read_reverse_order(file_name):
-    # Open file for reading in binary mode
-    with open(file_name, 'rb') as read_obj:
-        # Move the cursor to the end of the file
-        read_obj.seek(0, os.SEEK_END)
-        # Get the current position of pointer i.e eof
-        pointer_location = read_obj.tell()
-        # Create a buffer to keep the last read line
+    def __init__(self, file):
+        self.file = file
+
+    def readlines(self):
+        """Yield the file's lines as latin-1 decoded text, last line first.
+
+        Lines are split on the byte 0x0a and yielded without it. The file is
+        read in blocks of BLKSIZE bytes starting from its end, so memory use
+        does not grow with the file size. A file that ends with a newline
+        yields an empty string first.
+        """
+        BLKSIZE = 4096
+        # Move reader to the end of file
+        self.file.seek(0, os.SEEK_END)
        buffer = bytearray()
-        # Loop till pointer reaches the top of the file
-        while pointer_location >= 0:
-            # Move the file pointer to the location pointed by pointer_location
-            read_obj.seek(pointer_location)
-            # Shift pointer location by -1
-            pointer_location = pointer_location -1
-            # read that byte / character
-            new_byte = read_obj.read(1)
-            # If the read byte is new line character then it means one line is read
-            if new_byte == b'\n':
-                # Fetch the line from buffer and yield it
-                yield buffer.decode("latin-1")[::-1]
-                # Reinitialize the byte array to save next line
-                buffer = bytearray()
+
+        while True:
+            pos_newline = buffer.rfind(bytes([0x0a]))
+            # Get the current position of the reader
+            current_pos = self.file.tell()
+            if pos_newline != -1:
+                # Newline is found
+                line = buffer[pos_newline+1:]
+                buffer = buffer[:pos_newline]
+                yield line.decode("latin-1")
+            elif current_pos:
+                # Need to fill the buffer
+                to_read = min(BLKSIZE, current_pos)
+                self.file.seek(current_pos-to_read, 0)
+                buffer = self.file.read(to_read) + buffer
+                self.file.seek(current_pos-to_read, 0)
+                # This block reached the start of the file: prepend a
+                # sentinel newline so the very first line is yielded too.
+                # Compare by value; identity (`is`) between two ints is an
+                # interpreter detail and fails e.g. when both equal 4096.
+                if current_pos == to_read:
+                    buffer = bytes([0x0a]) + buffer
            else:
-                # If last read character is not eol then add it in buffer
-                buffer.extend(new_byte)
-        # As file is read completely, if there is still data in buffer, then its the first line.
-        if len(buffer) > 0:
-            # Yield the first line too
-            yield buffer.decode("latin-1")[::-1]
+                # Start of file
+                return
+
+
+
+def trim_encrypt_string(encrypt):
+    """Return the first balanced ``<<...>>`` dictionary found in ``encrypt``.
+
+    Everything after the ``>>`` that closes the first ``<<`` is dropped, as is
+    anything in front of that first ``<<``. Nested dictionaries are kept
+    whole. ``<...>`` hex strings are skipped as a unit so that their closing
+    ``>`` is never mistaken for half of a ``>>``.
+
+    If ``encrypt`` contains no ``<<`` it is returned unchanged; if the
+    dictionary is never closed, the text from the first ``<<`` to the end is
+    returned. Parenthesised literal strings are not parsed.
+    """
+    start = encrypt.find("<<")
+    if start == -1:
+        return encrypt
+
+    strlen = len(encrypt)
+    depth = 0
+    i = start
+    while i < strlen:
+        if encrypt.startswith("<<", i):
+            depth += 1
+            # Step over both characters so "<<<" / ">>>>" are not matched
+            # with overlap.
+            i += 2
+        elif encrypt.startswith(">>", i):
+            depth -= 1
+            i += 2
+            if depth == 0:
+                return encrypt[start:i]
+        elif encrypt[i] == "<":
+            # Hex string: jump past its closing ">".
+            close = encrypt.find(">", i + 1)
+            if close == -1:
+                break
+            i = close + 1
+        else:
+            i += 1
+
+    # Unbalanced input: nothing sensible to cut off.
+    return encrypt[start:]
+
+def cleanup_encrypt_element(element):
+    """Normalise the whitespace of one "/"-separated trailer element.
+
+    For an element starting with "ID[<", a space is inserted between
+    back-to-back "><". Every run of whitespace is then collapsed to a single
+    space (leading and trailing whitespace is dropped), and the space
+    following a "[" or a "]" is removed.
+    """
+    spaced = element.replace("><", "> <") if element.startswith("ID[<") else element
+
+    collapsed = " ".join(spaced.split())
+    for padded, tight in (("[ ", "["), ("] ", "]")):
+        collapsed = collapsed.replace(padded, tight)
+
+    return collapsed
+
+
+

 def deflate_and_base64_encode( string_val ):
+    # Compress the bytes in string_val with zlib, drop the first 2 and the
+    # last 4 bytes of the result and return what is left base64-encoded
+    # (as bytes).
+    # NOTE(review): presumably the slice removes the zlib header and the
+    # Adler-32 trailer so that only the raw deflate stream is kept - confirm.
    zlibbed_str = zlib.compress( string_val )
    compressed_string = zlibbed_str[2:-4]
    return base64.b64encode( compressed_string )

-def prepare_string_from_xml(xmlstring, title, author):
-    b64data = deflate_and_base64_encode(xmlstring.encode("utf-8")).decode("utf-8")
+def update_ebx_with_keys(ebx_data, adept_license, ebx_bookid):
+    # Return ebx_data with an /EBX_BOOKID and an /ADEPT_LICENSE entry
+    # appended.
+    #
+    # ebx_data:      text of the existing EBX dictionary.
+    # adept_license: license text; it is UTF-8 encoded, passed through
+    #                deflate_and_base64_encode() and stored as text.
+    # ebx_bookid:    value written into /EBX_BOOKID.

-    adobe_fulfill_response = etree.fromstring(xmlstring)
-    NSMAP = { "adept" : "http://ns.adobe.com/adept" }
-    adNS = lambda tag: '{%s}%s' % ('http://ns.adobe.com/adept', tag)
-    resource = adobe_fulfill_response.find("./%s/%s" % (adNS("licenseToken"), adNS("resource"))).text
+    b64data = deflate_and_base64_encode(adept_license.encode("utf-8")).decode("utf-8")

-    return "<</Length 128/EBX_TITLE(%s)/Filter/EBX_HANDLER/EBX_AUTHOR(%s)/V 4/ADEPT_ID(%s)/EBX_BOOKID(%s)/ADEPT_LICENSE(%s)>>" % (title, author, resource, resource, b64data)
+    # Drop the last two characters, then re-close the dictionary after the
+    # new entries.
+    # NOTE(review): assumes ebx_data ends with exactly ">>" and carries no
+    # trailing whitespace such as "\r" - TODO confirm against find_ebx().
+    ebx_new = ebx_data[:-2]
+    ebx_new += "/EBX_BOOKID(%s)/ADEPT_LICENSE(%s)>>" % (ebx_bookid, b64data)

-def patch_drm_into_pdf(filename_in, drm_string, filename_out):
+    return ebx_new

-    ORIG_FILE = filename_in
+
+def find_ebx(filename_in):
+    """Search ``filename_in`` from its end for the line holding "/EBX_HANDLER/".
+
+    Returns the matching line as text (as produced by BackwardReader), or
+    None when no line contains the marker. The number of lines inspected and
+    the elapsed time in milliseconds are printed either way.
+    """
+    find_ebx_start = int(time.time() * 1000)
+    i = 0
+
+    # The context manager closes the handle on every exit path, including
+    # the early return below.
+    with open(filename_in, "rb") as fl:
+        br = BackwardReader(fl)
+
+        for line in br.readlines():
+            i = i + 1
+            if "/EBX_HANDLER/" in line:
+                find_ebx_end = int(time.time() * 1000)
+                print("Found EBX after %d attempts - took %d ms" % (i, find_ebx_end - find_ebx_start))
+                return line
+
+    find_ebx_end = int(time.time() * 1000)
+    print("Error: Did not find EBX_HANDLER - took %d ms" % (find_ebx_end - find_ebx_start))
+    return None
+
+def find_enc(filename_in):
+    """Search ``filename_in`` from its end for the line referencing /Encrypt.
+
+    A line matches when it contains both "R/Encrypt" and "R/ID". Returns that
+    line as text (as produced by BackwardReader), or None when no line
+    matches. The number of lines inspected and the elapsed time in
+    milliseconds are printed either way.
+    """
+    find_enc_start = int(time.time() * 1000)
+    i = 0
+
+    # The context manager closes the handle on every exit path, including
+    # the early return below.
+    with open(filename_in, "rb") as fl:
+        br = BackwardReader(fl)
+
+        for line in br.readlines():
+            i = i + 1
+            if "R/Encrypt" in line and "R/ID" in line:
+                find_enc_end = int(time.time() * 1000)
+                print("Found ENC after %d attempts - took %d ms" % (i, find_enc_end - find_enc_start))
+                return line
+
+    find_enc_end = int(time.time() * 1000)
+    print("Error: Did not find ENC - took %d ms" % (find_enc_end - find_enc_start))
+    return None
+
+
+
+def patch_drm_into_pdf(filename_in, adept_license_string, filename_out, ebx_bookid):
+
+    drm_start_time = int(time.time() * 1000)

    trailer = ""
    trailer_idx = 0

-    print("DRM data is %s" % (drm_string))
+    startxref_offset = 0
+    prevline = ""

-    for line in read_reverse_order(ORIG_FILE):
+
+    fl = open(filename_in, "rb")
+    br = BackwardReader(fl)
+
+    print("Searching for startxref ...")
+    for line in br.readlines():
        trailer_idx += 1
        trailer = line + "\n" + trailer
-        print("DEBUG: pdfdata[%d] = %s" % (trailer_idx, line))
-        if (trailer_idx == 20):
-            print("trailer_idx is very large (%d). Usually it's 10 or less. File might be corrupted." % trailer_idx)
-        if (line == "trailer"): 
-            print("Found trailer at idx %d" % (trailer_idx))
+
+        print ("LINE: " + line)
+
+        if (trailer_idx > 10):
+            print("Took more than 10 attempts to find startxref ...")
+            return False
+        
+        if (line == "startxref"):
+            startxref_offset = int(prevline)
+            print("Got startxref: %d" % (startxref_offset))            
            break
+        prevline = line
+
+

    r_encrypt_offs1 = 0
    r_encrypt_offs2 = 0
-    root_str = None
-    next_startxref = False
-    startxref = None

-    for line in trailer.split('\n'):
-        #print(line)
-        if ("R/Encrypt" in line):
-            root_str = line
-            line_split = line.split(' ')
+    encrypt = None
+
+
+    encrypt = find_enc(filename_in)
+    if encrypt is None:
+        print("Error, enc not found")
+        return False
+
+    line_split = encrypt.split(' ')
+    next = 0
+    for element in line_split:
+        if element == "R/Encrypt":
+            next = 2
+            continue
+        if next == 2:
+            r_encrypt_offs1 = element
+            next = 1
+            continue
+        if next == 1: 
+            r_encrypt_offs2 = element
            next = 0
-            for element in line_split:
-                if element == "R/Encrypt":
-                    next = 2
-                    continue
-                if next == 2:
-                    r_encrypt_offs1 = element
-                    next = 1
-                    continue
-                if next == 1: 
-                    r_encrypt_offs2 = element
-                    next = 0
-                    continue
-        if "startxref" in line: 
-            next_startxref = True
-            continue
-        if next_startxref:
-            startxref = line
-            next_startxref = False
            continue


-    filesize_str = str(os.path.getsize(ORIG_FILE))
+    # read EBX element:
+    ebx_elem = find_ebx(filename_in)
+    
+    if (ebx_elem is None):
+        print("Err: EBX is None")
+        return False
+
+    
+    print("")
+    print("")
+    print("Encryption handler:")
+    print(encrypt)
+    print("EBX handler:")
+    print(ebx_elem)
+
+    encrypt = trim_encrypt_string(encrypt)
+
+    print("Trimmed encryption handler:")
+    print(encrypt)
+
+    ebx_elem = update_ebx_with_keys(ebx_elem, adept_license_string, ebx_bookid)
+
+    print("Updated EBX handler not logged due to sensitive data")
+    #print(ebx_elem)
+        
+
+    filesize_str = str(os.path.getsize(filename_in))
    filesize_pad = filesize_str.zfill(10)


    additional_data = "\r"
    additional_data += r_encrypt_offs1 + " " + r_encrypt_offs2 + " " + "obj" + "\r"
-    additional_data += drm_string
+    additional_data += ebx_elem
    additional_data += "\r"
    additional_data += "endobj"

@ -117,36 +226,40 @@ def patch_drm_into_pdf(filename_in, drm_string, filename_out):
    additional_data += "trailer"
    additional_data += "\r"

-    arr_root_str = root_str.split('/')
+    arr_root_str = encrypt.split('/')
    did_prev = False
    for elem in arr_root_str: 
        if elem.startswith("Prev"):
            did_prev = True
-            additional_data += "Prev " + startxref
+            additional_data += "Prev " + str(startxref_offset)
            #print("Replacing prev from '%s' to '%s'" % (elem, "Prev " + startxref))
-        elif elem.startswith("ID[<"):
-            additional_data += elem.replace("><", "> <")
        else:
-            additional_data += elem
+            additional_data += cleanup_encrypt_element(elem)
        additional_data += "/"

    if not did_prev:
        # remove two >> at end
        additional_data = additional_data[:-3]
-        additional_data += "/Prev " + startxref + ">>" + "/"
+        additional_data += "/Prev " + str(startxref_offset) + ">>" + "/"
        #print("Faking Prev %s" % startxref)

    additional_data = additional_data[:-1]

    additional_data += "\r" + "startxref\r" + str(ptr) + "\r" + "%%EOF"

-    print("Appending DRM data: %s" % (additional_data))
+    #print("Appending DRM data: %s" % (additional_data))


-    inp = open(ORIG_FILE, "rb")
+    inp = open(filename_in, "rb")

    out = open(filename_out, "wb")
    out.write(inp.read())
    out.write(additional_data.encode("latin-1"))
    inp.close()
    out.close()
+
+    drm_end_time = int(time.time() * 1000)
+
+    print("Whole DRM patching took %d milliseconds." % (drm_end_time - drm_start_time))
+
+    return True