# This is a python script. You need a Python interpreter to run it. # For example, ActiveState Python, which exists for windows. # # Big Thanks to Igor SKOCHINSKY for providing me with all his information # and source code relating to the inner workings of this compression scheme. # Without it, I wouldn't be able to solve this as easily. # # Changelog # 0.01 - Initial version # 0.02 - Fix issue with size computing # 0.03 - Fix issue with some files # 0.04 - make stdout self flushing and fix return values class Unbuffered: def __init__(self, stream): self.stream = stream def write(self, data): self.stream.write(data) self.stream.flush() def __getattr__(self, attr): return getattr(self.stream, attr) import sys sys.stdout=Unbuffered(sys.stdout) import struct class BitReader: def __init__(self, data): self.data, self.pos, self.nbits = data + "\x00\x00\x00\x00", 0, len(data) * 8 def peek(self, n): r, g = 0, 0 while g < n: r, g = (r << 8) | ord(self.data[(self.pos+g)>>3]), g + 8 - ((self.pos+g) & 7) return (r >> (g - n)) & ((1 << n) - 1) def eat(self, n): self.pos += n return self.pos <= self.nbits def left(self): return self.nbits - self.pos class HuffReader: def __init__(self, huffs): self.huffs = huffs h = huffs[0] if huffs[0][0:4] != 'HUFF' or huffs[0][4:8] != '\x00\x00\x00\x18': raise ValueError('invalid huff1 header') if huffs[1][0:4] != 'CDIC' or huffs[1][4:8] != '\x00\x00\x00\x10': raise ValueError('invalid huff2 header') self.entry_bits, = struct.unpack('>L', huffs[1][12:16]) off1,off2 = struct.unpack('>LL', huffs[0][16:24]) self.dict1 = struct.unpack('<256L', huffs[0][off1:off1+256*4]) self.dict2 = struct.unpack('<64L', huffs[0][off2:off2+64*4]) self.dicts = huffs[1:] self.r = '' def _unpack(self, bits, depth = 0): if depth > 32: raise ValueError('corrupt file') while bits.left(): dw = bits.peek(32) v = self.dict1[dw >> 24] codelen = v & 0x1F assert codelen != 0 code = dw >> (32 - codelen) r = (v >> 8) if not (v & 0x80): while code < self.dict2[(codelen-1)*2]: codelen += 1 code = dw >> (32 - codelen) r = self.dict2[(codelen-1)*2+1] r -= code assert codelen != 0 if not bits.eat(codelen): return dicno = r >> self.entry_bits off1 = 16 + (r - (dicno << self.entry_bits)) * 2 dic = self.dicts[dicno] off2 = 16 + ord(dic[off1]) * 256 + ord(dic[off1+1]) blen = ord(dic[off2]) * 256 + ord(dic[off2+1]) slice = dic[off2+2:off2+2+(blen&0x7fff)] if blen & 0x8000: self.r += slice else: self._unpack(BitReader(slice), depth + 1) def unpack(self, data): self.r = '' self._unpack(BitReader(data)) return self.r class Sectionizer: def __init__(self, filename, ident): self.contents = file(filename, 'rb').read() self.header = self.contents[0:72] self.num_sections, = struct.unpack('>H', self.contents[76:78]) if self.header[0x3C:0x3C+8] != ident: raise ValueError('Invalid file format') self.sections = [] for i in xrange(self.num_sections): offset, a1,a2,a3,a4 = struct.unpack('>LBBBB', self.contents[78+i*8:78+i*8+8]) flags, val = a1, a2<<16|a3<<8|a4 self.sections.append( (offset, flags, val) ) def loadSection(self, section): if section + 1 == self.num_sections: end_off = len(self.contents) else: end_off = self.sections[section + 1][0] off = self.sections[section][0] return self.contents[off:end_off] def getSizeOfTrailingDataEntry(ptr, size): bitpos, result = 0, 0 while True: v = ord(ptr[size-1]) result |= (v & 0x7F) << bitpos bitpos += 7 size -= 1 if (v & 0x80) != 0 or (bitpos >= 28) or (size == 0): return result def getSizeOfTrailingDataEntries(ptr, size, flags): num = 0 flags >>= 1 while flags: if flags & 1: num += getSizeOfTrailingDataEntry(ptr, size - num) flags >>= 1 return num def unpackBook(input_file): sect = Sectionizer(input_file, 'BOOKMOBI') header = sect.loadSection(0) crypto_type, = struct.unpack('>H', header[0xC:0xC+2]) if crypto_type != 0: raise ValueError('The book is encrypted. Run mobidedrm first') if header[0:2] != 'DH': raise ValueError('invalid compression type') extra_flags, = struct.unpack('>L', header[0xF0:0xF4]) records, = struct.unpack('>H', header[0x8:0x8+2]) huffoff,huffnum = struct.unpack('>LL', header[0x70:0x78]) huffs = [sect.loadSection(i) for i in xrange(huffoff, huffoff+huffnum)] huff = HuffReader(huffs) def decompressSection(nr): data = sect.loadSection(nr) trail_size = getSizeOfTrailingDataEntries(data, len(data), extra_flags) return huff.unpack(data[0:len(data)-trail_size]) r = '' for i in xrange(1, records+1): r += decompressSection(i) return r def main(argv=sys.argv): print "MobiHuff v0.03" print " Copyright (c) 2008 The Dark Reverser " if len(sys.argv)!=3: print "" print "Description:" print " Unpacks the new mobipocket huffdic compression." print " This program works with unencrypted files only." print "Usage:" print " mobihuff.py infile.mobi outfile.html" return 1 else: infile = sys.argv[1] outfile = sys.argv[2] try: print "Decompressing...", result = unpackBook(infile) file(outfile, 'wb').write(result) print "done" except ValueError, e: print print "Error: %s" % e return 1 return 0 if __name__ == "__main__": sys.exit(main())