From d228c0a4a5827325bca47d63ea287c7cb56537ea Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Thu, 3 Oct 2013 08:51:41 +0200 Subject: work around python-debian's #670679 --- dedup/debpkg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'dedup') diff --git a/dedup/debpkg.py b/dedup/debpkg.py index d8cc22f..2d67135 100644 --- a/dedup/debpkg.py +++ b/dedup/debpkg.py @@ -16,8 +16,8 @@ def process_control(control_contents): source = package version = control["version"].encode("ascii") architecture = control["architecture"].encode("ascii") - - depends = set(dep[0]["name"].encode("ascii") + # deb822 currently returns :any dependencies raw. see #670679 + depends = set(dep[0]["name"].split(u':', 1)[0].encode("ascii") for dep in control.relations.get("depends", ()) if len(dep) == 1) return dict(package=package, source=source, version=version, -- cgit v1.2.3 From 17597b5e828f9bbc9b0159102b173c284c23a140 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Wed, 19 Feb 2014 07:54:21 +0100 Subject: DecompressedHash should fail on trailing input Otherwise all files smaller than 10 bytes are successfully hashed to the hash of the empty input when using the GzipDecompressor. Reported-By: Olly Betts --- dedup/hashing.py | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'dedup') diff --git a/dedup/hashing.py b/dedup/hashing.py index 002eda8..5f015b2 100644 --- a/dedup/hashing.py +++ b/dedup/hashing.py @@ -49,9 +49,13 @@ class DecompressedHash(object): def hexdigest(self): if not hasattr(self.decompressor, "flush"): + if self.decompressor.unused_data: + raise ValueError("decompressor did not consume all data") return self.hashobj.hexdigest() tmpdecomp = self.decompressor.copy() data = tmpdecomp.flush() + if tmpdecomp.unused_data: + raise ValueError("decompressor did not consume all data") tmphash = self.hashobj.copy() tmphash.update(data) return tmphash.hexdigest() -- cgit v1.2.3 From d467a2a4e85d4b6f09bd2e3dc70466bfcc45a577 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Wed, 19 Feb 2014 14:19:56 +0100 Subject: GzipDecompressor: don't treat checksum as garbage trailer --- dedup/compression.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'dedup') diff --git a/dedup/compression.py b/dedup/compression.py index 869c49f..4ce258c 100644 --- a/dedup/compression.py +++ b/dedup/compression.py @@ -5,8 +5,11 @@ class GzipDecompressor(object): """An interface to gzip which is similar to bz2.BZ2Decompressor and lzma.LZMADecompressor.""" def __init__(self): + self.sawheader = False self.inbuffer = b"" self.decompressor = None + self.crc = 0 + self.size = 0 def decompress(self, data): """ @@ -16,6 +19,8 @@ class GzipDecompressor(object): while True: if self.decompressor: data = self.decompressor.decompress(data) + self.crc = zlib.crc32(data, self.crc) + self.size += len(data) unused_data = self.decompressor.unused_data if not unused_data: return data @@ -45,13 +50,20 @@ class GzipDecompressor(object): return b"" data = self.inbuffer[skip:] self.inbuffer = b"" + self.sawheader = True self.decompressor = zlib.decompressobj(-zlib.MAX_WBITS) @property def unused_data(self): if self.decompressor: return self.decompressor.unused_data + elif not self.sawheader: + return self.inbuffer else: + expect = struct.pack(" Date: Wed, 19 Feb 2014 14:21:20 +0100 Subject: blacklist content rather than hashes Otherwise the gzip hash cannot tell the empty stream and the compressed empty stream apart. --- dedup/hashing.py | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ importpkg.py | 15 +++++---------- 2 files changed, 55 insertions(+), 10 deletions(-) (limited to 'dedup') diff --git a/dedup/hashing.py b/dedup/hashing.py index 5f015b2..70f6268 100644 --- a/dedup/hashing.py +++ b/dedup/hashing.py @@ -1,3 +1,5 @@ +import itertools + class HashBlacklist(object): """Turn a hashlib-like object into a hash that returns None for some blacklisted hashes instead of the real hash value. @@ -29,6 +31,54 @@ class HashBlacklist(object): def copy(self): return HashBlacklist(self.hashobj.copy(), self.blacklist) +class HashBlacklistContent(object): + """Turn a hashlib-like object into a hash that returns None for some + blacklisted content instead of the real hash value. Unlike HashBlacklist, + not the output of the hash is considered, but its input.""" + + def __init__(self, hashobj, blacklist=(), maxlen=None): + """ + @param hashobj: a hashlib-like object + @param blacklist: an object providing __contains__. + hash inputs which are contained in the blacklist + are turned into None values + @param maxlen: the maximum length of a blacklisted input. + Defaults to max(map(len, blacklist)), so if it is absent, + the blacklist must support iteration. + """ + self.hashobj = hashobj + self.blacklist = blacklist + if maxlen is None: + # the chain avoids passing the empty sequence to max + maxlen = max(itertools.chain((0,), itertools.imap(len, blacklist))) + self.maxlen = maxlen + self.stored = "" + + @property + def name(self): + return self.hashobj.name + + def update(self, data): + if self.stored is not None: + self.stored += data + if len(self.stored) > self.maxlen: + self.stored = None + self.hashobj.update(data) + + def digest(self): + if self.stored is not None and self.stored in self.blacklist: + return None + return self.hashobj.digest() + + def hexdigest(self): + if self.stored is not None and self.stored in self.blacklist: + return None + return self.hashobj.hexdigest() + + def copy(self): + return HashBlacklistContent(self.hashobj.copy(), self.blacklist, + self.maxlen) + class DecompressedHash(object): """Apply a decompression function before the hash. This class provides the hashlib interface (update, hexdigest, copy) excluding digest and name.""" diff --git a/importpkg.py b/importpkg.py index 54f6181..cb16f97 100755 --- a/importpkg.py +++ b/importpkg.py @@ -16,26 +16,21 @@ import yaml from dedup.arreader import ArReader from dedup.debpkg import process_control, get_tar_hashes -from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \ - HashedStream +from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \ + HashBlacklistContent from dedup.compression import GzipDecompressor, DecompressedStream from dedup.image import GIFHash, PNGHash -boring_sha512_hashes = set(( - # "" - "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e", - # "\n" - "be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09")) +boring_content = set(("", "\n")) def sha512_nontrivial(): - return HashBlacklist(hashlib.sha512(), boring_sha512_hashes) + return HashBlacklistContent(hashlib.sha512(), boring_content) def gziphash(): hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512()) hashobj = SuppressingHash(hashobj, (ValueError, zlib.error)) hashobj.name = "gzip_sha512" - # don't blacklist boring hashes for gzip to get gzip issues right - return hashobj + return HashBlacklistContent(hashobj, boring_content) def pnghash(): hashobj = PNGHash(hashlib.sha512()) -- cgit v1.2.3 From 8ccd5205f77276b333c56efb8271a0ddf11590a0 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Sun, 23 Feb 2014 17:29:41 +0100 Subject: fix spelling mistake Reported-By: Stefan Kaltenbrunner --- dedup/templates/index.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'dedup') diff --git a/dedup/templates/index.html b/dedup/templates/index.html index 7c9000f..169027e 100644 --- a/dedup/templates/index.html +++ b/dedup/templates/index.html @@ -28,7 +28,7 @@ {% block content %}

Debian duplication detector

    -
  • To inspect a particlar binary package, go to
    binary/<packagename>
    Example: binary/git +
  • To inspect a particular binary package, go to
    binary/<packagename>
    Example: binary/git