From 5780f81524f0ae0957380b16db586090a181eaa0 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Mon, 19 Aug 2013 11:52:39 +0200
Subject: importpkg: don't blacklist boring gzip_sha512 hashes

* In practice there are very few compressed files with trivial hashes.
* Blacklisting these values results in false positives in the gzip issues.
---
 importpkg.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'importpkg.py')

diff --git a/importpkg.py b/importpkg.py
index 182ca01..1334dd6 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -42,7 +42,8 @@ def gziphash():
     hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512())
     hashobj = SuppressingHash(hashobj, (ValueError, zlib.error))
     hashobj.name = "gzip_sha512"
-    return HashBlacklist(hashobj, boring_sha512_hashes)
+    # don't blacklist boring hashes for gzip to get gzip issues right
+    return hashobj
 
 def pnghash():
     hashobj = PNGHash(hashlib.sha512())
--
cgit v1.2.3
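Why the blacklist misfired for gzip_sha512: that hash is computed over the
decompressed payload, so a gzip stream compressing the empty string hashes to
sha512(""), which is exactly the first of the blacklisted "boring" values.
The files most relevant to gzip issue detection were thus reported with no
hash at all. A minimal standard-library sketch of the collision (an
illustration only, not part of the patch):

    import gzip
    import hashlib
    import io

    # gzip-compress the empty string
    buf = io.BytesIO()
    gzip.GzipFile(fileobj=buf, mode="wb").close()

    # hashing the decompressed payload yields sha512(""), i.e. the first
    # entry of the boring_sha512_hashes set that gziphash used to blacklist
    payload = gzip.GzipFile(fileobj=io.BytesIO(buf.getvalue())).read()
    assert hashlib.sha512(payload).hexdigest().startswith("cf83e1357eefb8bd")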
From 3134b18dd8e4932b03b87453e6ee4b4a93b5595f Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Mon, 2 Sep 2013 09:30:05 +0200
Subject: importpkg: move library-like parts to dedup.debpkg

---
 dedup/debpkg.py | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 importpkg.py    | 55 +++++++++----------------------------------------------
 2 files changed, 64 insertions(+), 46 deletions(-)
 create mode 100644 dedup/debpkg.py

(limited to 'importpkg.py')

diff --git a/dedup/debpkg.py b/dedup/debpkg.py
new file mode 100644
index 0000000..d8cc22f
--- /dev/null
+++ b/dedup/debpkg.py
@@ -0,0 +1,55 @@
+from debian import deb822
+
+from dedup.hashing import hash_file
+
+def process_control(control_contents):
+    """Parses the contents of a control file from a control.tar.gz of a Debian
+    package and returns a dictionary containing the fields relevant to dedup.
+    @type control_contents: bytes
+    @rtype: {str: object}
+    """
+    control = deb822.Packages(control_contents)
+    package = control["package"].encode("ascii")
+    try:
+        source = control["source"].encode("ascii").split()[0]
+    except KeyError:
+        source = package
+    version = control["version"].encode("ascii")
+    architecture = control["architecture"].encode("ascii")
+
+    depends = set(dep[0]["name"].encode("ascii")
+                  for dep in control.relations.get("depends", ())
+                  if len(dep) == 1)
+    return dict(package=package, source=source, version=version,
+                architecture=architecture, depends=depends)
+
+class MultiHash(object):
+    def __init__(self, *hashes):
+        self.hashes = hashes
+
+    def update(self, data):
+        for hasher in self.hashes:
+            hasher.update(data)
+
+def get_tar_hashes(tar, hash_functions):
+    """Given a TarFile, read all regular files and compute all of the given
+    hash functions on each file.
+    @type tar: tarfile.TarFile
+    @param hash_functions: a sequence of parameter-less functions each creating
+        a new hashlib-like object
+    @rtype: gen((str, int, {str: str}))
+    @returns: an iterable of (filename, filesize, hashes) tuples where hashes
+        is a dict mapping hash function names to hash values
+    """
+
+    for elem in tar:
+        if not elem.isreg(): # excludes hard links as well
+            continue
+        hasher = MultiHash(*[func() for func in hash_functions])
+        hasher = hash_file(hasher, tar.extractfile(elem))
+        hashes = {}
+        for hashobj in hasher.hashes:
+            hashvalue = hashobj.hexdigest()
+            if hashvalue:
+                hashes[hashobj.name] = hashvalue
+        yield (elem.name, elem.size, hashes)
diff --git a/importpkg.py b/importpkg.py
index 1334dd6..54f6181 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -11,24 +11,16 @@ import sys
 import tarfile
 import zlib
 
-from debian import deb822
 import lzma
 import yaml
 
 from dedup.arreader import ArReader
+from dedup.debpkg import process_control, get_tar_hashes
 from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \
-    HashedStream, hash_file
+    HashedStream
 from dedup.compression import GzipDecompressor, DecompressedStream
 from dedup.image import GIFHash, PNGHash
 
-class MultiHash(object):
-    def __init__(self, *hashes):
-        self.hashes = hashes
-
-    def update(self, data):
-        for hasher in self.hashes:
-            hasher.update(data)
-
 boring_sha512_hashes = set((
     # ""
     "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
@@ -57,37 +49,7 @@ def gifhash():
     hashobj.name = "gif_sha512"
     return hashobj
 
-def get_hashes(tar):
-    for elem in tar:
-        if not elem.isreg(): # excludes hard links as well
-            continue
-        hasher = MultiHash(sha512_nontrivial(), gziphash(), pnghash(),
-                           gifhash())
-        hasher = hash_file(hasher, tar.extractfile(elem))
-        hashes = {}
-        for hashobj in hasher.hashes:
-            hashvalue = hashobj.hexdigest()
-            if hashvalue:
-                hashes[hashobj.name] = hashvalue
-        yield (elem.name, elem.size, hashes)
-
-def process_control(control_contents):
-    control = deb822.Packages(control_contents)
-    package = control["package"].encode("ascii")
-    try:
-        source = control["source"].encode("ascii").split()[0]
-    except KeyError:
-        source = package
-    version = control["version"].encode("ascii")
-    architecture = control["architecture"].encode("ascii")
-
-    depends = set(dep[0]["name"].encode("ascii")
-                  for dep in control.relations.get("depends", ())
-                  if len(dep) == 1)
-    return dict(package=package, source=source, version=version,
-                architecture=architecture, depends=depends)
-
-def process_package(filelike):
+def process_package(filelike, hash_functions):
     af = ArReader(filelike)
     af.read_magic()
     state = "start"
@@ -123,7 +85,7 @@ def process_package(filelike):
             continue
         if state != "control_file":
             raise ValueError("missing control file")
-        for name, size, hashes in get_hashes(tf):
+        for name, size, hashes in get_tar_hashes(tf, hash_functions):
             try:
                 name = name.decode("utf8")
             except UnicodeDecodeError:
@@ -133,9 +95,9 @@ def process_package(filelike):
         yield "commit"
         break
 
-def process_package_with_hash(filelike, sha256hash):
+def process_package_with_hash(filelike, hash_functions, sha256hash):
     hstream = HashedStream(filelike, hashlib.sha256())
-    for elem in process_package(hstream):
+    for elem in process_package(hstream, hash_functions):
         if elem == "commit":
             while hstream.read(4096):
                 pass
@@ -150,10 +112,11 @@ def main():
     parser.add_option("-H", "--hash", action="store",
                       help="verify that stdin has given sha256 hash")
     options, args = parser.parse_args()
+    hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash]
     if options.hash:
-        gen = process_package_with_hash(sys.stdin, options.hash)
+        gen = process_package_with_hash(sys.stdin, hash_functions, options.hash)
     else:
-        gen = process_package(sys.stdin)
+        gen = process_package(sys.stdin, hash_functions)
     yaml.safe_dump_all(gen, sys.stdout)
 
 if __name__ == "__main__":
--
cgit v1.2.3
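With the move, the helpers can be driven outside importpkg.py as well. A
rough Python 2 usage sketch (the tarball path is hypothetical, and plain
hashlib.sha512 stands in for importpkg's hash stack, since any parameter-less
factory of hashlib-like objects with a name attribute qualifies as a hash
function here):

    import hashlib
    import tarfile

    from dedup.debpkg import get_tar_hashes

    tar = tarfile.open("data.tar")  # hypothetical tarball path
    for name, size, hashes in get_tar_hashes(tar, [hashlib.sha512]):
        print name, size, hashes
    tar.close()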
From 332ac9eafb235443f163c606ced95dcbd615815e Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Wed, 19 Feb 2014 14:21:20 +0100
Subject: blacklist content rather than hashes

Otherwise the gzip hash cannot tell the empty stream and the compressed
empty stream apart.
---
 dedup/hashing.py | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 importpkg.py     | 15 +++++----------
 2 files changed, 55 insertions(+), 10 deletions(-)

(limited to 'importpkg.py')

diff --git a/dedup/hashing.py b/dedup/hashing.py
index 5f015b2..70f6268 100644
--- a/dedup/hashing.py
+++ b/dedup/hashing.py
@@ -1,3 +1,5 @@
+import itertools
+
 class HashBlacklist(object):
     """Turn a hashlib-like object into a hash that returns None for some
     blacklisted hashes instead of the real hash value.
@@ -29,6 +31,54 @@ class HashBlacklist(object):
     def copy(self):
         return HashBlacklist(self.hashobj.copy(), self.blacklist)
 
+class HashBlacklistContent(object):
+    """Turn a hashlib-like object into a hash that returns None for some
+    blacklisted content instead of the real hash value. Unlike HashBlacklist,
+    it considers the input of the hash rather than its output."""
+
+    def __init__(self, hashobj, blacklist=(), maxlen=None):
+        """
+        @param hashobj: a hashlib-like object
+        @param blacklist: an object providing __contains__.
+            hash inputs which are contained in the blacklist
+            are turned into None values
+        @param maxlen: the maximum length of a blacklisted input.
+            Defaults to max(map(len, blacklist)), so if it is absent,
+            the blacklist must support iteration.
+        """
+        self.hashobj = hashobj
+        self.blacklist = blacklist
+        if maxlen is None:
+            # the chain avoids passing the empty sequence to max
+            maxlen = max(itertools.chain((0,), itertools.imap(len, blacklist)))
+        self.maxlen = maxlen
+        self.stored = ""
+
+    @property
+    def name(self):
+        return self.hashobj.name
+
+    def update(self, data):
+        if self.stored is not None:
+            self.stored += data
+            if len(self.stored) > self.maxlen:
+                self.stored = None
+        self.hashobj.update(data)
+
+    def digest(self):
+        if self.stored is not None and self.stored in self.blacklist:
+            return None
+        return self.hashobj.digest()
+
+    def hexdigest(self):
+        if self.stored is not None and self.stored in self.blacklist:
+            return None
+        return self.hashobj.hexdigest()
+
+    def copy(self):
+        return HashBlacklistContent(self.hashobj.copy(), self.blacklist,
+                                    self.maxlen)
+
 class DecompressedHash(object):
     """Apply a decompression function before the hash. This class provides
     the hashlib interface (update, hexdigest, copy) excluding digest and
     name."""
diff --git a/importpkg.py b/importpkg.py
index 54f6181..cb16f97 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -16,26 +16,21 @@ import yaml
 
 from dedup.arreader import ArReader
 from dedup.debpkg import process_control, get_tar_hashes
-from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \
-    HashedStream
+from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
+    HashBlacklistContent
 from dedup.compression import GzipDecompressor, DecompressedStream
 from dedup.image import GIFHash, PNGHash
 
-boring_sha512_hashes = set((
-    # ""
-    "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
-    # "\n"
-    "be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09"))
+boring_content = set(("", "\n"))
 
 def sha512_nontrivial():
-    return HashBlacklist(hashlib.sha512(), boring_sha512_hashes)
+    return HashBlacklistContent(hashlib.sha512(), boring_content)
 
 def gziphash():
     hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512())
     hashobj = SuppressingHash(hashobj, (ValueError, zlib.error))
     hashobj.name = "gzip_sha512"
-    # don't blacklist boring hashes for gzip to get gzip issues right
-    return hashobj
+    return HashBlacklistContent(hashobj, boring_content)
 
 def pnghash():
     hashobj = PNGHash(hashlib.sha512())
--
cgit v1.2.3
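The difference the commit message describes is observable directly: the
blacklist now triggers on what was fed into the hash, not on what digest came
out, so the empty input is suppressed while a gzip stream that merely
compresses empty content is not. A small sketch against the classes added
above (illustration only):

    import hashlib

    from dedup.hashing import HashBlacklistContent

    boring = set(("", "\n"))

    h = HashBlacklistContent(hashlib.sha512(), boring)
    h.update("")
    assert h.hexdigest() is None  # input "" is blacklisted

    h = HashBlacklistContent(hashlib.sha512(), boring)
    h.update("some payload")      # longer than maxlen, cannot be blacklisted
    assert h.hexdigest() == hashlib.sha512("some payload").hexdigest()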
From 8d4c5512edbdcdd1063a7e6508f398a5a57981be Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Sun, 23 Feb 2014 18:19:35 +0100
Subject: spell check comments

---
 dedup/hashing.py | 2 +-
 dedup/image.py   | 2 +-
 importpkg.py     | 4 ++--
 webapp.py        | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'importpkg.py')

diff --git a/dedup/hashing.py b/dedup/hashing.py
index 70f6268..a8a46c7 100644
--- a/dedup/hashing.py
+++ b/dedup/hashing.py
@@ -115,7 +115,7 @@ class DecompressedHash(object):
 
 class SuppressingHash(object):
     """A hash that silences exceptions from the update and hexdigest methods of
-    a hashlib-like object. If an exception has occured, hexdigest always
+    a hashlib-like object. If an exception has occurred, hexdigest always
     returns None."""
     def __init__(self, hashobj, exceptions=()):
         """
diff --git a/dedup/image.py b/dedup/image.py
index c1f2de0..ef17989 100644
--- a/dedup/image.py
+++ b/dedup/image.py
@@ -4,7 +4,7 @@ import struct
 import PIL.Image
 
 class ImageHash(object):
-    """A hash on the contents of an image datat type supported by PIL. This
+    """A hash on the contents of an image data type supported by PIL. This
     disregards mode, depth and meta information. Note that due to limitations
     in PIL and the image format (interlacing) the full contents are stored and
     decoded in hexdigest."""
diff --git a/importpkg.py b/importpkg.py
index cb16f97..aeccda5 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -1,7 +1,7 @@
 #!/usr/bin/python
-"""This tool reads a debian package from stdin and emits a yaml stream on
+"""This tool reads a Debian package from stdin and emits a yaml stream on
 stdout. It does not access a database. Therefore it can be run in parallel and
-on multiple machines. The generated yaml conatins multiple documents. The first
+on multiple machines. The generated yaml contains multiple documents. The first
 document contains package metadata. Then a document is emitted for each file.
 And finally a document consisting of the string "commit" is emitted."""
 
diff --git a/webapp.py b/webapp.py
index fd6d685..2fd69bb 100755
--- a/webapp.py
+++ b/webapp.py
@@ -151,7 +151,7 @@ class Application(object):
         return html_response(package_template.render(params))
 
     def compute_comparison(self, pid1, pid2):
-        """Compute a sequence of comparison objects ordery by the size of the
+        """Compute a sequence of comparison objects ordered by the size of the
         object in the first package. Each element of the sequence is a dict
         defining the following keys:
         * filenames: A set of filenames in package 1 (pid1) all referring to
--
cgit v1.2.3
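The corrected docstring pins down the stream layout: one metadata document,
then one document per file, then the literal string "commit". A consumer
sketch under that reading (the exact per-file keys follow whatever
process_package yields, which this diff does not show):

    import sys

    import yaml

    docs = yaml.safe_load_all(sys.stdin)
    metadata = next(docs)        # package, source, version, architecture, ...
    for doc in docs:
        if doc == "commit":      # end-of-package marker
            break
        print doc                # one document per extracted file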