Huffman Encoded File Format

From DCppWiki

Jump to: navigation, search

Python

This is a Python function to decode the HE3 compressed format used in MyList.DcLst files. To use this function:

he3_decoder('path/to/MyList.DcLst','path/to/Uncompressed.MyList.DcLst')

To further increase speed by up to 10x, enable the psyco module:

# Optional speed-up: turn on psyco profiling when the module is installed.
try:
    import psyco
    psyco.profile()
except ImportError:
    # psyco is not available; carry on without it.
    pass

Decompression routine:

"""
HE3 compression decoder
by Dody Suria Wijaya <dodysw@gmail.com>
Converted originally from he3.c by Eric Prevoteau <www@a2pb.gotdns.org>
"""
import struct, array, os
def he3_decoder(path_from, path_to):
    """Decode the HE3 (Huffman) compressed file path_from into path_to.

    File layout, as read by this function:

    * bytes 0-3: the magic "HE3" followed by a carriage return
    * byte 4: not examined here
    * bytes 5-8: number of decoded bytes (little-endian unsigned 32-bit)
    * bytes 9-10: number of couples (little-endian unsigned 16-bit)
    * from byte 11: one (symbol, pattern bit length) byte pair per couple
    * then the patterns packed back to back, padded up to a full byte
    * then the encoded data

    Bits are consumed starting from the least significant bit of each byte.

    Args:
        path_from: path of the HE3 compressed input file.
        path_to: path the decoded bytes are written to (overwritten).

    Raises:
        Exception: the file does not start with the HE3 magic.
        struct.error: the file is too short to hold the 11-byte header.
        IndexError: the file is truncated, or the encoded bit stream does
            not match any pattern.
    """
    with open(path_from, 'rb') as src:
        data = bytearray(src.read())
    if bytes(data[:4]) != b'HE3\r':
        raise Exception('Invalid HE3 header format. If this a HE3 file?')
    nb_output, nb_couple = struct.unpack('<LH', bytes(data[5:11]))

    # The couple list is (symbol, bit length) byte pairs starting at byte 11.
    pattern_start = 11 + 2 * nb_couple
    if len(data) < pattern_start:
        raise IndexError('Truncated HE3 file: couple list is incomplete')
    symbols = data[11:pattern_start:2]
    lengths = data[12:pattern_start:2]
    max_len = max(lengths) if lengths else 0  # longest encoded pattern
    ttl_len = sum(lengths)  # total size of all encoded patterns, in bits

    # The pattern block is just after the couples; the encoded data follow
    # it, with the pattern block rounded up to a full byte.
    bit_pos = 8 * pattern_start
    encoded_pos = bit_pos + ((ttl_len + 7) & ~7)

    # Map each pattern to its symbol. The key is the pattern's bits prefixed
    # with a single 1 bit, i.e. (1 << length) + bits, so patterns of
    # different lengths never collide. A dict (rather than a flat table
    # indexed by that key) keeps memory proportional to the number of
    # couples and lets the symbol 0 be a legal value.
    table = {}
    for symbol, length in zip(symbols, lengths):
        key = 1
        for _ in range(length):
            key = (key << 1) | ((data[bit_pos >> 3] >> (bit_pos & 7)) & 1)
            bit_pos += 1
        table[key] = symbol

    output = bytearray(nb_output)
    lookup = table.get
    bit_pos = encoded_pos
    written = 0
    # A while loop (not range) avoids building a huge list on Python 2.
    while written < nb_output:
        key = 1
        nbits = 0
        symbol = None
        # Extend the candidate code one bit at a time until it is a pattern.
        while symbol is None:
            if nbits == max_len:
                # No pattern is longer than max_len, so the stream is bad.
                raise IndexError(
                    'Corrupt HE3 data: bit sequence matches no pattern')
            key = (key << 1) | ((data[bit_pos >> 3] >> (bit_pos & 7)) & 1)
            bit_pos += 1
            nbits += 1
            symbol = lookup(key)
        output[written] = symbol
        written += 1

    with open(path_to, 'wb') as dst:
        dst.write(output)