diff options
-rw-r--r-- | dedup/compression.py | 15 |
1 file changed, 15 insertions, 0 deletions
diff --git a/dedup/compression.py b/dedup/compression.py
index 869c49f..4ce258c 100644
--- a/dedup/compression.py
+++ b/dedup/compression.py
@@ -5,8 +5,11 @@ class GzipDecompressor(object):
     """An interface to gzip which is similar to bz2.BZ2Decompressor and
     lzma.LZMADecompressor."""
     def __init__(self):
+        self.sawheader = False
         self.inbuffer = b""
         self.decompressor = None
+        self.crc = 0
+        self.size = 0
 
     def decompress(self, data):
         """
@@ -16,6 +19,8 @@ class GzipDecompressor(object):
         while True:
             if self.decompressor:
                 data = self.decompressor.decompress(data)
+                self.crc = zlib.crc32(data, self.crc)
+                self.size += len(data)
                 unused_data = self.decompressor.unused_data
                 if not unused_data:
                     return data
@@ -45,13 +50,20 @@ class GzipDecompressor(object):
                 return b""
             data = self.inbuffer[skip:]
             self.inbuffer = b""
+            self.sawheader = True
             self.decompressor = zlib.decompressobj(-zlib.MAX_WBITS)
 
     @property
     def unused_data(self):
         if self.decompressor:
             return self.decompressor.unused_data
+        elif not self.sawheader:
+            return self.inbuffer
         else:
+            expect = struct.pack("<ll", self.crc, self.size)
+            if self.inbuffer.startswith(expect) and \
+                    self.inbuffer[len(expect):].replace("\0", "") == "":
+                return b""
             return self.inbuffer
 
     def flush(self):
@@ -67,6 +79,9 @@ class GzipDecompressor(object):
         new.inbuffer = self.inbuffer
         if self.decompressor:
             new.decompressor = self.decompressor.copy()
+        new.sawheader = self.sawheader
+        new.crc = self.crc
+        new.size = self.size
         return new
 
 class DecompressedStream(object):