summaryrefslogtreecommitdiff
path: root/dedup/compression.py
blob: 4ce258c499bff6ab0519f5120584bc3a60e666be (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import struct
import zlib

class GzipDecompressor(object):
    """An interface to gzip which is similar to bz2.BZ2Decompressor and
    lzma.LZMADecompressor.

    Data is fed incrementally through decompress. The gzip header is parsed
    by this class; the deflate payload is handed to a raw zlib decompression
    object. Whatever follows the deflate stream (normally the 8 byte
    CRC-32/ISIZE trailer) is collected in inbuffer and judged by the
    unused_data property.

    Only a single gzip member is handled: once the deflate stream has ended,
    feeding enough further bytes to reach 10 buffered bytes makes decompress
    look for a gzip magic at the start of that buffer and raise ValueError
    if it is missing.
    """
    def __init__(self):
        # Set once a complete gzip header has been parsed.
        self.sawheader = False
        # Bytes received but not handed to zlib: an incomplete header before
        # the deflate stream, or the bytes following it afterwards.
        self.inbuffer = b""
        # Raw deflate decompression object, None while none is active.
        self.decompressor = None
        # Running CRC-32 and length of all decompressed output so far.
        self.crc = 0
        self.size = 0

    def decompress(self, data):
        """Feed compressed bytes and return the bytes decompressed so far.

        @param data: the next chunk of the gzip stream (may be empty)
        @returns: decompressed bytes, b"" if more input is needed first
        @raises ValueError: if no gzip magic is found
        @raises zlib.error: from zlib invocations
        """
        while True:
            if self.decompressor:
                data = self.decompressor.decompress(data)
                self.crc = zlib.crc32(data, self.crc)
                self.size += len(data)
                unused_data = self.decompressor.unused_data
                if not unused_data:
                    return data
                # The deflate stream ended inside this chunk. The rest is
                # routed through the header path below, which buffers it.
                self.decompressor = None
                return data + self.decompress(unused_data)
            self.inbuffer += data
            # Fixed part of the header: magic, method, flags, mtime, xfl, os.
            skip = 10
            if len(self.inbuffer) < skip:
                return b""
            if not self.inbuffer.startswith(b"\037\213\010"):
                raise ValueError("gzip magic not found")
            # Slice instead of index: indexing bytes yields an int on
            # Python 3, which ord() rejects. A slice works on 2 and 3.
            flag = ord(self.inbuffer[3:4])
            if flag & 4:  # FEXTRA: 2 byte little endian length, then data
                if len(self.inbuffer) < skip + 2:
                    return b""
                length, = struct.unpack("<H", self.inbuffer[skip:skip+2])
                skip += 2 + length
            for field in (8, 16):  # FNAME, FCOMMENT: NUL terminated strings
                if flag & field:
                    length = self.inbuffer.find(b"\0", skip)
                    if length < 0:
                        # Terminator not received yet; retry on next call.
                        return b""
                    skip = length + 1
            if flag & 2:  # FHCRC: 2 byte header checksum, not verified
                skip += 2
            if len(self.inbuffer) < skip:
                return b""
            # Header complete: everything after it is deflate data, which
            # the next loop iteration pushes through the new decompressor.
            data = self.inbuffer[skip:]
            self.inbuffer = b""
            self.sawheader = True
            self.decompressor = zlib.decompressobj(-zlib.MAX_WBITS)

    @property
    def unused_data(self):
        """Bytes that were received but are not part of the gzip stream.

        While a deflate stream is active this is the unused_data of the zlib
        object. Before a header was seen it is the buffered input. After the
        deflate stream ended it is b"" if the buffered bytes are exactly the
        expected trailer optionally followed by NUL padding, and the
        buffered bytes otherwise.
        """
        if self.decompressor:
            return self.decompressor.unused_data
        elif not self.sawheader:
            return self.inbuffer
        else:
            # The trailer holds two unsigned 32 bit little endian values:
            # CRC-32 and the size modulo 2**32. Masking keeps this correct
            # for negative crc32 results (Python 2) and for large outputs.
            expect = struct.pack("<LL", self.crc & 0xffffffff,
                                 self.size & 0xffffffff)
            if self.inbuffer.startswith(expect) and \
                    not self.inbuffer[len(expect):].strip(b"\0"):
                return b""
            return self.inbuffer

    def flush(self):
        """Return any output still held back by the zlib object.

        @returns: remaining decompressed bytes, b"" if no deflate stream is
            active
        @raises zlib.error: from zlib invocations
        """
        if not self.decompressor:
            return b""
        return self.decompressor.flush()

    def copy(self):
        """Return an independent GzipDecompressor in the same state."""
        new = GzipDecompressor()
        new.inbuffer = self.inbuffer
        if self.decompressor:
            new.decompressor = self.decompressor.copy()
        new.sawheader = self.sawheader
        new.crc = self.crc
        new.size = self.size
        return new

class DecompressedStream(object):
    """Turn a readable file-like into a decompressed file-like. The only part
    of being file-like consists of the read(size) method in both cases."""
    # Number of compressed bytes requested from fileobj per read call.
    blocksize = 65536

    def __init__(self, fileobj, decompressor):
        """
        @param fileobj: a file-like object providing read(size)
        @param decompressor: a bz2.BZ2Decompressor or lzma.LZMADecompressor
            like object providing a decompress method and an attribute
            unused_data. A flush method is used at end of input if present.
        """
        self.fileobj = fileobj
        self.decompressor = decompressor
        # Decompressed bytes that have not been returned to the caller yet.
        self.buff = b""

    def read(self, length=None):
        """Read decompressed data.

        @param length: maximum number of bytes to return. None or a negative
            value reads until the end of the underlying file.
        @returns: up to length decompressed bytes; fewer (possibly b"") only
            when the underlying file is exhausted
        """
        if length is not None and length < 0:
            # Slicing with a negative length would drop and reorder data;
            # follow the file convention that negative means "everything".
            length = None
        data = True
        while True:
            if length is not None and len(self.buff) >= length:
                ret = self.buff[:length]
                self.buff = self.buff[length:]
                return ret
            elif not data: # read EOF in last iteration
                ret = self.buff
                self.buff = b""
                return ret
            data = self.fileobj.read(self.blocksize)
            if data:
                self.buff += self.decompressor.decompress(data)
            else:
                # Not every decompressor has flush (the standard library
                # bz2.BZ2Decompressor does not), so only call it if present.
                flush = getattr(self.decompressor, "flush", None)
                if flush is not None:
                    self.buff += flush()