From 069f5412dd1ae1f4695a168ae17aded9fb4461fb Mon Sep 17 00:00:00 2001
From: Helmut Grohne <helmut@subdivi.de>
Date: Fri, 16 Aug 2013 22:36:04 +0200
Subject: webapp templates: add an anchor for file issues

---
 dedup/templates/binary.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dedup/templates/binary.html b/dedup/templates/binary.html
index 69eceef..46c4fa6 100644
--- a/dedup/templates/binary.html
+++ b/dedup/templates/binary.html
@@ -21,7 +21,7 @@
 Note: Packages with yellow background are required to be installed when this package is installed.
 {%- endif -%}
 {%- if issues -%}
-<h3>issues with particular files</h3>
+<h3 id="issues">issues with particular files</h3>
 <table border=1>
   <tr><th>filename</th><th>issue</th></tr>
 {%- for filename, issue in issues|dictsort(true) -%}
-- 
cgit v1.2.3


From 1aa2948aaaa2a8e2474918ef57ab84a67d80e804 Mon Sep 17 00:00:00 2001
From: Helmut Grohne <helmut@subdivi.de>
Date: Fri, 16 Aug 2013 22:45:18 +0200
Subject: make debian version_compare available in sql

---
 dedup/utils.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/dedup/utils.py b/dedup/utils.py
index 2fae9fd..6fb233b 100644
--- a/dedup/utils.py
+++ b/dedup/utils.py
@@ -1,3 +1,5 @@
+from debian.debian_support import version_compare
+
 def fetchiter(cursor):
     rows = cursor.fetchmany()
     while rows:
@@ -5,3 +7,6 @@ def fetchiter(cursor):
             yield row
         rows = cursor.fetchmany()
 
+def sql_add_version_compare(db):
+    db.create_collation("debian_version", version_compare)
+    db.create_function("debian_version_compare", 2, version_compare)
-- 
cgit v1.2.3
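
As a usage sketch (not part of the patch; the table and data are invented
for illustration), the registered collation sorts Debian version strings
semantically, and the two-argument function gives cmp()-style results in
SQL expressions:

    import sqlite3

    from dedup.utils import sql_add_version_compare

    db = sqlite3.connect(":memory:")
    sql_add_version_compare(db)
    db.execute("CREATE TABLE package (name TEXT, version TEXT)")
    db.executemany("INSERT INTO package VALUES (?, ?)",
                   [("foo", "1.0-1"), ("foo", "1.0~rc1-1"), ("foo", "1.10-1")])
    # Debian ordering: "~" sorts before everything and 1.10 > 1.9 > 1.0,
    # both of which plain string comparison gets wrong.
    versions = [row[0] for row in db.execute(
        "SELECT version FROM package ORDER BY version COLLATE debian_version")]
    assert versions == ["1.0~rc1-1", "1.0-1", "1.10-1"]
    # Epochs are honoured as well.
    assert db.execute("SELECT debian_version_compare('2:1.0-1', '1.0-1')"
                      ).fetchone()[0] > 0
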
From 3134b18dd8e4932b03b87453e6ee4b4a93b5595f Mon Sep 17 00:00:00 2001
From: Helmut Grohne <helmut@subdivi.de>
Date: Mon, 2 Sep 2013 09:30:05 +0200
Subject: importpkg: move library-like parts to dedup.debpkg

---
 dedup/debpkg.py | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++
 importpkg.py    | 55 +++++++++----------------------------------------
 2 files changed, 64 insertions(+), 46 deletions(-)
 create mode 100644 dedup/debpkg.py

diff --git a/dedup/debpkg.py b/dedup/debpkg.py
new file mode 100644
index 0000000..d8cc22f
--- /dev/null
+++ b/dedup/debpkg.py
@@ -0,0 +1,55 @@
+from debian import deb822
+
+from dedup.hashing import hash_file
+
+def process_control(control_contents):
+    """Parses the contents of a control file from a control.tar.gz of a Debian
+    package and returns a dictionary containing the fields relevant to dedup.
+    @type control_contents: bytes
+    @rtype: {str: object}
+    """
+    control = deb822.Packages(control_contents)
+    package = control["package"].encode("ascii")
+    try:
+        source = control["source"].encode("ascii").split()[0]
+    except KeyError:
+        source = package
+    version = control["version"].encode("ascii")
+    architecture = control["architecture"].encode("ascii")
+
+    depends = set(dep[0]["name"].encode("ascii")
+                  for dep in control.relations.get("depends", ())
+                  if len(dep) == 1)
+    return dict(package=package, source=source, version=version,
+                architecture=architecture, depends=depends)
+
+class MultiHash(object):
+    def __init__(self, *hashes):
+        self.hashes = hashes
+
+    def update(self, data):
+        for hasher in self.hashes:
+            hasher.update(data)
+
+def get_tar_hashes(tar, hash_functions):
+    """Given a TarFile read all regular files and compute all of the given hash
+    functions on each file.
+    @type tar: tarfile.TarFile
+    @param hash_functions: a sequence of parameter-less functions each creating a
+        new hashlib-like object
+    @rtype: gen((str, int, {str: str}))
+    @returns: an iterable of (filename, filesize, hashes) tuples where
+        hashes is a dict mapping hash function names to hash values
+    """
+
+    for elem in tar:
+        if not elem.isreg(): # excludes hard links as well
+            continue
+        hasher = MultiHash(*[func() for func in hash_functions])
+        hasher = hash_file(hasher, tar.extractfile(elem))
+        hashes = {}
+        for hashobj in hasher.hashes:
+            hashvalue = hashobj.hexdigest()
+            if hashvalue:
+                hashes[hashobj.name] = hashvalue
+        yield (elem.name, elem.size, hashes)
diff --git a/importpkg.py b/importpkg.py
index 1334dd6..54f6181 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -11,24 +11,16 @@ import sys
 import tarfile
 import zlib
 
-from debian import deb822
 import lzma
 import yaml
 
 from dedup.arreader import ArReader
+from dedup.debpkg import process_control, get_tar_hashes
 from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \
-    HashedStream, hash_file
+    HashedStream
 from dedup.compression import GzipDecompressor, DecompressedStream
 from dedup.image import GIFHash, PNGHash
 
-class MultiHash(object):
-    def __init__(self, *hashes):
-        self.hashes = hashes
-
-    def update(self, data):
-        for hasher in self.hashes:
-            hasher.update(data)
-
 boring_sha512_hashes = set((
     # ""
     "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
@@ -57,37 +49,7 @@ def gifhash():
     hashobj.name = "gif_sha512"
     return hashobj
 
-def get_hashes(tar):
-    for elem in tar:
-        if not elem.isreg(): # excludes hard links as well
-            continue
-        hasher = MultiHash(sha512_nontrivial(), gziphash(), pnghash(),
-                           gifhash())
-        hasher = hash_file(hasher, tar.extractfile(elem))
-        hashes = {}
-        for hashobj in hasher.hashes:
-            hashvalue = hashobj.hexdigest()
-            if hashvalue:
-                hashes[hashobj.name] = hashvalue
-        yield (elem.name, elem.size, hashes)
-
-def process_control(control_contents):
-    control = deb822.Packages(control_contents)
-    package = control["package"].encode("ascii")
-    try:
-        source = control["source"].encode("ascii").split()[0]
-    except KeyError:
-        source = package
-    version = control["version"].encode("ascii")
-    architecture = control["architecture"].encode("ascii")
-
-    depends = set(dep[0]["name"].encode("ascii")
-                  for dep in control.relations.get("depends", ())
-                  if len(dep) == 1)
-    return dict(package=package, source=source, version=version,
-                architecture=architecture, depends=depends)
-
-def process_package(filelike):
+def process_package(filelike, hash_functions):
     af = ArReader(filelike)
     af.read_magic()
     state = "start"
@@ -123,7 +85,7 @@ def process_package(filelike):
             continue
         if state != "control_file":
             raise ValueError("missing control file")
-        for name, size, hashes in get_hashes(tf):
+        for name, size, hashes in get_tar_hashes(tf, hash_functions):
             try:
                 name = name.decode("utf8")
             except UnicodeDecodeError:
@@ -133,9 +95,9 @@ def process_package(filelike):
         yield "commit"
         break
 
-def process_package_with_hash(filelike, sha256hash):
+def process_package_with_hash(filelike, hash_functions, sha256hash):
     hstream = HashedStream(filelike, hashlib.sha256())
-    for elem in process_package(hstream):
+    for elem in process_package(hstream, hash_functions):
         if elem == "commit":
             while hstream.read(4096):
                 pass
@@ -150,10 +112,11 @@ def main():
     parser.add_option("-H", "--hash", action="store",
                       help="verify that stdin has the given sha256 hash")
     options, args = parser.parse_args()
+    hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash]
    if options.hash:
-        gen = process_package_with_hash(sys.stdin, options.hash)
+        gen = process_package_with_hash(sys.stdin, hash_functions, options.hash)
     else:
-        gen = process_package(sys.stdin)
+        gen = process_package(sys.stdin, hash_functions)
     yaml.safe_dump_all(gen, sys.stdout)
 
 if __name__ == "__main__":
-- 
cgit v1.2.3
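
With the refactoring, get_tar_hashes is usable outside importpkg.py. A
minimal sketch of the new entry point (the tarball path and the choice of
plain sha512 are illustrative; importpkg.py wires in its own hash
constructors):

    import hashlib
    import sys
    import tarfile

    from dedup.debpkg import get_tar_hashes

    # e.g. a data.tar previously extracted from some .deb
    tar = tarfile.open("data.tar")
    for name, size, hashes in get_tar_hashes(tar, [hashlib.sha512]):
        sys.stdout.write("%s %d %r\n" % (name, size, hashes))
    tar.close()
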
From 49cac8bdae0ec787372d227545411ef14905d6a8 Mon Sep 17 00:00:00 2001
From: Helmut Grohne <helmut@subdivi.de>
Date: Wed, 4 Sep 2013 10:15:59 +0200
Subject: webapp: serve static files from /static

---
 dedup/templates/base.html | 4 ++--
 webapp.py                 | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/dedup/templates/base.html b/dedup/templates/base.html
index 62f4087..9dfb788 100644
--- a/dedup/templates/base.html
+++ b/dedup/templates/base.html
@@ -3,8 +3,8 @@
 <head>
 <title>{% block title %}{% endblock %}</title>
-<link rel="stylesheet" type="text/css" href="style.css">
-<script src="jquery.js"></script>
+<link rel="stylesheet" type="text/css" href="/static/style.css">
+<script src="/static/jquery.js"></script>
 {% block header %}{% endblock %}
 </head>
 <body>
diff --git a/webapp.py b/webapp.py
index 632b485..d5f076e 100755
--- a/webapp.py
+++ b/webapp.py
@@ -244,7 +244,7 @@ def main():
                       help="path to the sqlite3 database file")
     options, args = parser.parse_args()
     app = Application(sqlite3.connect(options.database))
-    app = SharedDataMiddleware(app, {"/": ("dedup", "static")})
+    app = SharedDataMiddleware(app, {"/static": ("dedup", "static")})
     make_server("0.0.0.0", 8800, app).serve_forever()
 
 if __name__ == "__main__":
-- 
cgit v1.2.3


From d228c0a4a5827325bca47d63ea287c7cb56537ea Mon Sep 17 00:00:00 2001
From: Helmut Grohne <helmut@subdivi.de>
Date: Thu, 3 Oct 2013 08:51:41 +0200
Subject: work around python-debian's #670679

---
 dedup/debpkg.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dedup/debpkg.py b/dedup/debpkg.py
index d8cc22f..2d67135 100644
--- a/dedup/debpkg.py
+++ b/dedup/debpkg.py
@@ -16,8 +16,8 @@ def process_control(control_contents):
         source = package
     version = control["version"].encode("ascii")
     architecture = control["architecture"].encode("ascii")
-
-    depends = set(dep[0]["name"].encode("ascii")
+    # deb822 currently returns :any dependencies raw. see #670679
+    depends = set(dep[0]["name"].split(u':', 1)[0].encode("ascii")
                   for dep in control.relations.get("depends", ())
                   if len(dep) == 1)
     return dict(package=package, source=source, version=version,
-- 
cgit v1.2.3
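
To see what the workaround guards against, here is a sketch with a made-up
control stanza; on python-debian versions affected by #670679, the parsed
relation name still carries the architecture qualifier:

    from debian import deb822

    control = deb822.Packages("Package: foo\nDepends: python:any (>= 2.6)\n")
    dep = control.relations["depends"][0][0]
    # Affected python-debian versions report dep["name"] as u"python:any";
    # splitting on ":" yields the bare package name either way.
    name = dep["name"].split(u':', 1)[0].encode("ascii")
    assert name == b"python"
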
From 17597b5e828f9bbc9b0159102b173c284c23a140 Mon Sep 17 00:00:00 2001
From: Helmut Grohne <helmut@subdivi.de>
Date: Wed, 19 Feb 2014 07:54:21 +0100
Subject: DecompressedHash should fail on trailing input

Otherwise all files smaller than 10 bytes are successfully hashed to the
hash of the empty input when using the GzipDecompressor.

Reported-By: Olly Betts
---
 dedup/hashing.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/dedup/hashing.py b/dedup/hashing.py
index 002eda8..5f015b2 100644
--- a/dedup/hashing.py
+++ b/dedup/hashing.py
@@ -49,9 +49,13 @@ class DecompressedHash(object):
     def hexdigest(self):
         if not hasattr(self.decompressor, "flush"):
+            if self.decompressor.unused_data:
+                raise ValueError("decompressor did not consume all data")
             return self.hashobj.hexdigest()
         tmpdecomp = self.decompressor.copy()
         data = tmpdecomp.flush()
+        if tmpdecomp.unused_data:
+            raise ValueError("decompressor did not consume all data")
         tmphash = self.hashobj.copy()
         tmphash.update(data)
         return tmphash.hexdigest()
-- 
cgit v1.2.3


From d467a2a4e85d4b6f09bd2e3dc70466bfcc45a577 Mon Sep 17 00:00:00 2001
From: Helmut Grohne <helmut@subdivi.de>
Date: Wed, 19 Feb 2014 14:19:56 +0100
Subject: GzipDecompressor: don't treat checksum as garbage trailer

---
 dedup/compression.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/dedup/compression.py b/dedup/compression.py
index 869c49f..4ce258c 100644
--- a/dedup/compression.py
+++ b/dedup/compression.py
@@ -5,8 +5,11 @@ class GzipDecompressor(object):
     """An interface to gzip which is similar to bz2.BZ2Decompressor and
     lzma.LZMADecompressor."""
     def __init__(self):
+        self.sawheader = False
         self.inbuffer = b""
         self.decompressor = None
+        self.crc = 0
+        self.size = 0
 
     def decompress(self, data):
         """
@@ -16,6 +19,8 @@ class GzipDecompressor(object):
         while True:
             if self.decompressor:
                 data = self.decompressor.decompress(data)
+                self.crc = zlib.crc32(data, self.crc)
+                self.size += len(data)
                 unused_data = self.decompressor.unused_data
                 if not unused_data:
                     return data
@@ -45,13 +50,20 @@ class GzipDecompressor(object):
                 return b""
             data = self.inbuffer[skip:]
             self.inbuffer = b""
+            self.sawheader = True
             self.decompressor = zlib.decompressobj(-zlib.MAX_WBITS)
 
     @property
     def unused_data(self):
         if self.decompressor:
             return self.decompressor.unused_data
+        elif not self.sawheader:
+            return self.inbuffer
         else:
+            expect = struct.pack("<ll", self.crc, self.size)
+            if self.inbuffer.startswith(expect) and \
+                    self.inbuffer[len(expect):].replace(b"\0", b"") == b"":
+                return b""
             return self.inbuffer
-- 
cgit v1.2.3
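
Taken together, the two fixes above make the gzip hash reject streams it
could not fully decode instead of silently hashing nothing. A behaviour
sketch, assuming the post-fix semantics described in the commit messages
(gzip/hashlib/io only build the test input; note that gziphash in
importpkg.py additionally wraps this in SuppressingHash, which turns the
ValueError into a suppressed, i.e. None, hash):

    import gzip
    import hashlib
    import io

    from dedup.compression import GzipDecompressor
    from dedup.hashing import DecompressedHash

    buf = io.BytesIO()
    f = gzip.GzipFile(fileobj=buf, mode="wb")
    f.write(b"hello")
    f.close()

    # A complete gzip stream hashes to the digest of its decompressed
    # content; the CRC/size trailer is no longer reported as garbage.
    hasher = DecompressedHash(GzipDecompressor(), hashlib.sha512())
    hasher.update(buf.getvalue())
    assert hasher.hexdigest() == hashlib.sha512(b"hello").hexdigest()

    # Input shorter than the 10-byte gzip header used to hash like the
    # empty input; now hexdigest raises instead.
    hasher = DecompressedHash(GzipDecompressor(), hashlib.sha512())
    hasher.update(b"tiny")
    try:
        hasher.hexdigest()
    except ValueError:
        pass  # expected: decompressor did not consume all data
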
+ """ + self.hashobj = hashobj + self.blacklist = blacklist + if maxlen is None: + # the chain avoids passing the empty sequence to max + maxlen = max(itertools.chain((0,), itertools.imap(len, blacklist))) + self.maxlen = maxlen + self.stored = "" + + @property + def name(self): + return self.hashobj.name + + def update(self, data): + if self.stored is not None: + self.stored += data + if len(self.stored) > self.maxlen: + self.stored = None + self.hashobj.update(data) + + def digest(self): + if self.stored is not None and self.stored in self.blacklist: + return None + return self.hashobj.digest() + + def hexdigest(self): + if self.stored is not None and self.stored in self.blacklist: + return None + return self.hashobj.hexdigest() + + def copy(self): + return HashBlacklistContent(self.hashobj.copy(), self.blacklist, + self.maxlen) + class DecompressedHash(object): """Apply a decompression function before the hash. This class provides the hashlib interface (update, hexdigest, copy) excluding digest and name.""" diff --git a/importpkg.py b/importpkg.py index 54f6181..cb16f97 100755 --- a/importpkg.py +++ b/importpkg.py @@ -16,26 +16,21 @@ import yaml from dedup.arreader import ArReader from dedup.debpkg import process_control, get_tar_hashes -from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \ - HashedStream +from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \ + HashBlacklistContent from dedup.compression import GzipDecompressor, DecompressedStream from dedup.image import GIFHash, PNGHash -boring_sha512_hashes = set(( - # "" - "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e", - # "\n" - "be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09")) +boring_content = set(("", "\n")) def sha512_nontrivial(): - return HashBlacklist(hashlib.sha512(), boring_sha512_hashes) + return HashBlacklistContent(hashlib.sha512(), boring_content) def gziphash(): hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512()) hashobj = SuppressingHash(hashobj, (ValueError, zlib.error)) hashobj.name = "gzip_sha512" - # don't blacklist boring hashes for gzip to get gzip issues right - return hashobj + return HashBlacklistContent(hashobj, boring_content) def pnghash(): hashobj = PNGHash(hashlib.sha512()) -- cgit v1.2.3 From 8ccd5205f77276b333c56efb8271a0ddf11590a0 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Sun, 23 Feb 2014 17:29:41 +0100 Subject: fix spelling mistake Reported-By: Stefan Kaltenbrunner --- dedup/templates/index.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'dedup') diff --git a/dedup/templates/index.html b/dedup/templates/index.html index 7c9000f..169027e 100644 --- a/dedup/templates/index.html +++ b/dedup/templates/index.html @@ -28,7 +28,7 @@ {% block content %}

Debian duplication detector

    -
  • To inspect a particlar binary package, go to
    binary/<packagename>
    Example: binary/git +
  • To inspect a particular binary package, go to
    binary/<packagename>
    Example: binary/git