| author    | Helmut Grohne <helmut@subdivi.de> | 2013-02-20 15:27:40 +0100 |
| committer | Helmut Grohne <helmut@subdivi.de> | 2013-02-20 15:27:40 +0100 |
| commit    | 0c7b8f30143923571a950d86c79c2b9522e07487 (patch) | |
| tree      | 449b81e2a1475f1562a5782096e2e0f05d9ac9b6 | |
| download  | debian-dedup-0c7b8f30143923571a950d86c79c2b9522e07487.tar.gz | |
first prototype
| -rwxr-xr-x | test.py   | 146 |
| -rwxr-xr-x | webapp.py |  90 |
2 files changed, 236 insertions, 0 deletions
diff --git a/test.py b/test.py
new file mode 100755
--- /dev/null
+++ b/test.py
@@ -0,0 +1,146 @@
+#!/usr/bin/python
+"""
+CREATE TABLE content (package TEXT, version TEXT, architecture TEXT, filename TEXT, size INTEGER, hash TEXT);
+CREATE INDEX content_package_index ON content (package);
+CREATE INDEX content_hash_index ON content (hash);
+"""
+
+import hashlib
+import os
+import re
+import sqlite3
+import struct
+import sys
+import tarfile
+
+import apt_pkg
+import lzma
+
+apt_pkg.init()
+
+class ArReader(object):
+    global_magic = b"!<arch>\n"
+    file_magic = b"`\n"
+
+    def __init__(self, fileobj, membertest):
+        self.fileobj = fileobj
+        self.membertest = membertest
+        self.remaining = None
+
+    def skip(self, length):
+        while length:
+            data = self.fileobj.read(min(4096, length))
+            if not data:
+                raise ValueError("archive truncated")
+            length -= len(data)
+
+    def skiptillmember(self):
+        data = self.fileobj.read(len(self.global_magic))
+        if data != self.global_magic:
+            raise ValueError("ar global header not found")
+        while True:
+            file_header = self.fileobj.read(60)
+            if not file_header:
+                raise ValueError("end of archive found")
+            parts = struct.unpack("16s 12s 6s 6s 8s 10s 2s", file_header)
+            parts = [p.rstrip(" ") for p in parts]
+            if parts.pop() != self.file_magic:
+                print(repr(file_header))
+                raise ValueError("ar file header not found")
+            name = parts[0]
+            length = int(parts[5])
+            if self.membertest(name):
+                self.remaining = length
+                return name, length
+            self.skip(length + length % 2)
+
+    def read(self, length=None):
+        if length is None:
+            length = self.remaining
+        else:
+            length = min(self.remaining, length)
+        data = self.fileobj.read(length)
+        self.remaining -= len(data)
+        return data
+
+    def close(self):
+        self.fileobj.close()
+
+class XzStream(object):
+    blocksize = 65536
+
+    def __init__(self, fileobj):
+        self.fileobj = fileobj
+        self.decomp = lzma.LZMADecompressor()
+        self.buff = b""
+
+    def read(self, length):
+        data = True
+        while True:
+            if len(self.buff) >= length:
+                ret = self.buff[:length]
+                self.buff = self.buff[length:]
+                return ret
+            elif not data: # read EOF in last iteration
+                ret = self.buff
+                self.buff = b""
+                return ret
+            data = self.fileobj.read(self.blocksize)
+            if data:
+                self.buff += self.decomp.decompress(data)
+            else:
+                self.buff += self.decomp.flush()
+
+def hash_file(hashobj, filelike, blocksize=65536):
+    data = filelike.read(blocksize)
+    while data:
+        hashobj.update(data)
+        data = filelike.read(blocksize)
+    return hashobj
+
+def get_hashes(filelike):
+    af = ArReader(filelike, lambda name: name.startswith("data.tar"))
+    name, membersize = af.skiptillmember()
+    if name == "data.tar.gz":
+        tf = tarfile.open(fileobj=af, mode="r|gz")
+    elif name == "data.tar.bz2":
+        tf = tarfile.open(fileobj=af, mode="r|bz2")
+    elif name == "data.tar.xz":
+        zf = XzStream(af)
+        tf = tarfile.open(fileobj=zf, mode="r|")
+    else:
+        raise ValueError("unsupported compression %r" % name)
+    for elem in tf:
+        if elem.size == 0: # boring
+            continue
+        if not elem.isreg(): # excludes hard links as well
+            continue
+        hasher = hash_file(hashlib.sha512(), tf.extractfile(elem))
+        yield (elem.name, elem.size, hasher.hexdigest())
+
+def main():
+    filename = sys.argv[1]
+    match = re.match("(?:.*/)?(?P<name>[^_]+)_(?P<version>[^_]+)_(?P<architecture>[^_.]+)\\.deb$", filename)
+    package, version, architecture = match.groups()
+    db = sqlite3.connect("test.sqlite3")
+    cur = db.cursor()
+
+    cur.execute("SELECT version FROM content WHERE package = ?;", (package,))
+    versions = [tpl[0] for tpl in cur.fetchall()]
+    versions.append(version)
+    versions.sort(cmp=apt_pkg.version_compare)
+    if versions[-1] != version:
+        return # not the newest version
+
+    cur.execute("DELETE FROM content WHERE package = ?;", (package,))
+    #cur.execute("DELETE FROM content WHERE package = ? AND version = ? AND architecture = ?;",
+    #            (package, version, architecture))
+    with open(filename) as pkg:
+        for name, size, hexhash in get_hashes(pkg):
+            name = name.decode("utf8")
+            cur.execute("INSERT INTO content (package, version, architecture, filename, size, hash) VALUES (?, ?, ?, ?, ?, ?);",
+                        (package, version, architecture, name, size, hexhash))
+    db.commit()
+
+if __name__ == "__main__":
+    main()
diff --git a/webapp.py b/webapp.py
new file mode 100755
index 0000000..ec9c079
--- /dev/null
+++ b/webapp.py
@@ -0,0 +1,90 @@
+#!/usr/bin/python
+
+import sqlite3
+from wsgiref.simple_server import make_server
+
+from werkzeug.debug import DebuggedApplication
+from werkzeug.exceptions import HTTPException, NotFound
+from werkzeug.routing import Map, Rule
+from werkzeug.wrappers import Request, Response
+
+def format_size(size):
+    size = float(size)
+    fmt = "%d B"
+    if size >= 1024:
+        size /= 1024
+        fmt = "%.1f KB"
+    if size >= 1024:
+        size /= 1024
+        fmt = "%.1f MB"
+    if size >= 1024:
+        size /= 1024
+        fmt = "%.1f GB"
+    return fmt % size
+
+class Application(object):
+    def __init__(self):
+        self.db = sqlite3.connect("test.sqlite3")
+        self.cur = self.db.cursor()
+        self.routingmap = Map([
+            Rule("/<package>", methods=("GET",),
+                 endpoint="package"),
+        ])
+
+    @Request.application
+    def __call__(self, request):
+        mapadapter = self.routingmap.bind_to_environ(request.environ)
+        try:
+            endpoint, args = mapadapter.match()
+            assert endpoint == "package"
+            return self.show_package(args["package"])
+        except HTTPException as e:
+            return e
+
+    def show_package(self, package):
+        self.cur.execute("SELECT version, architecture FROM content WHERE package = ? LIMIT 1;", (package,))
+        row = self.cur.fetchone()
+        if not row:
+            raise NotFound()
+        version, architecture = row
+        self.cur.execute("SELECT count(filename) FROM content WHERE package = ?;", (package,))
+        num_files = self.cur.fetchone()[0]
+        self.cur.execute("SELECT sum(size) FROM content WHERE package = ?;", (package,))
+        total_size = self.cur.fetchone()[0]
+        content = "<p>Version: %s</p><p>Architecture: %s</p>" % (version, architecture)
+        content += "<p>Number of files: %d</p>" % num_files
+        content += "<p>Total size: %s</p>" % format_size(total_size)
+
+        shared = dict()
+        self.cur.execute("SELECT a.filename, a.hash, a.size, b.package FROM content AS a JOIN content AS b ON a.hash = b.hash WHERE a.package = ? AND (a.filename != b.filename OR b.package != ?);", (package, package))
+        for afile, hashval, size, bpkg in self.cur.fetchall():
+            shared.setdefault(bpkg, dict()).setdefault(hashval, (size, set()))[1].add(afile)
+        if shared:
+            sharedstats = []
+            mapping = shared.pop(package, dict())
+            if mapping:
+                duplicate = sum(len(files) for _, files in mapping.values())
+                savable = sum(size * (len(files) - 1) for size, files in mapping.values())
+                sharedstats.append(("self", duplicate, savable))
+            for pkg, mapping in shared.items():
+                pkglink = '<a href="%s">%s</a>' % (pkg, pkg)
+                duplicate = sum(len(files) for _, files in mapping.values())
+                savable = sum(size * len(files) for size, files in mapping.values())
+                sharedstats.append((pkglink, duplicate, savable))
+            sharedstats.sort(key=lambda row: row[2], reverse=True)
+            content += "<table border='1'><tr><th>package</th><th>files shared</th><th>data shared</th></tr>"
+            for pkg, duplicate, savable in sharedstats:
+                content += "<tr><td>%s</td><td>%d (%d%%)</td><td>%s (%d%%)</td></tr>" % (pkg, duplicate, 100. * duplicate / num_files, format_size(savable), 100. * savable / total_size)
+            content += "</table>"
+
+        r = Response(content_type="text/html")
+        r.data = "<html><head><title>duplication of %(package)s</title></head><body><h1>%(package)s</h1>%(content)s</body></html>" % dict(package=package, content=content)
+        return r
+
+def main():
+    app = Application()
+    app = DebuggedApplication(app, evalex=True)
+    make_server("localhost", 8800, app).serve_forever()
+
+if __name__ == "__main__":
+    main()
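
The commit itself carries no usage notes. As a rough sketch (not part of the commit): once test.py has been run over one or more .deb files, the test.sqlite3 database it populates can be inspected directly with the schema from its docstring, for example to list the hashes that occur in more than one package:

```python
# Hypothetical inspection snippet, not part of the commit above.
# Assumes test.py has already populated test.sqlite3 with its schema:
#   content (package, version, architecture, filename, size, hash)
import sqlite3

db = sqlite3.connect("test.sqlite3")
cur = db.cursor()
# Hashes shared by more than one package, biggest files first.
cur.execute("""
    SELECT hash, count(DISTINCT package), max(size)
    FROM content
    GROUP BY hash
    HAVING count(DISTINCT package) > 1
    ORDER BY max(size) DESC
    LIMIT 10;
""")
for hashval, npackages, size in cur.fetchall():
    print("%s... in %d packages, %d bytes" % (hashval[:16], npackages, size))
```

The per-package view of the same data is what webapp.py serves: after starting it, http://localhost:8800/<package> renders the duplication statistics for that package.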