summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHelmut Grohne <helmut@subdivi.de>2013-07-15 07:21:09 +0200
committerHelmut Grohne <helmut@subdivi.de>2013-07-15 07:21:09 +0200
commit6205c89b1e289f04dcea1e6e32fafa6357abf063 (patch)
tree021281e49a05b2533c671b1648c0bb6b856b06c4
parent69c7a945120e95b5a389c2708994e907c7ca82e0 (diff)
parent14020d53fd4853aa72f159885edad1dcbfce2ee7 (diff)
downloaddebian-dedup-6205c89b1e289f04dcea1e6e32fafa6357abf063.tar.gz
Merge branch 'packageid'
-rw-r--r--README8
-rwxr-xr-xautoimport.py4
-rwxr-xr-xreadyaml.py35
-rw-r--r--schema.sql12
-rwxr-xr-xupdate_sharing.py18
-rwxr-xr-xwebapp.py59
6 files changed, 72 insertions, 64 deletions
diff --git a/README b/README
index 55659dc..44b086a 100644
--- a/README
+++ b/README
@@ -38,17 +38,17 @@ SQL database by hand. Here are some example queries.
Finding the 100 largest files shared with multiple packages.
- SELECT a.package, a.filename, b.package, b.filename, a.size FROM content AS a JOIN hash AS ha ON a.id = ha.cid JOIN hash AS hb ON ha.hash = hb.hash JOIN content AS b ON b.id = hb.cid WHERE (a.package != b.package OR a.filename != b.filename) ORDER BY a.size DESC LIMIT 100;
+ SELECT pa.name, a.filename, pb.name, b.filename, a.size FROM content AS a JOIN hash AS ha ON a.id = ha.cid JOIN hash AS hb ON ha.hash = hb.hash JOIN content AS b ON b.id = hb.cid JOIN package AS pa ON b.pid = pa.id JOIN package AS pb ON b.pid = pb.id WHERE (a.pid != b.pid OR a.filename != b.filename) ORDER BY a.size DESC LIMIT 100;
Finding those top 100 files that save most space when being reduced to only
one copy in the archive.
- SELECT hash, sum(size)-min(size), count(*), count(distinct package) FROM content JOIN hash ON content.id = hash.cid WHERE hash.function = "sha512" GROUP BY hash ORDER BY sum(size)-min(size) DESC LIMIT 100;
+ SELECT hash, sum(size)-min(size), count(*), count(distinct pid) FROM content JOIN hash ON content.id = hash.cid WHERE hash.function = "sha512" GROUP BY hash ORDER BY sum(size)-min(size) DESC LIMIT 100;
Finding PNG images that do not carry a .png file extension.
- SELECT package, filename, size FROM content JOIN hash ON content.id = hash.cid WHERE function = "image_sha512" AND filename NOT LIKE "%.png";
+ SELECT package.name, content.filename, content.size FROM content JOIN hash ON content.id = hash.cid JOIN package ON content.pid = package.id WHERE function = "image_sha512" AND filename NOT LIKE "%.png";
Finding .gz files which either are not gziped or contain errors.
- SELECT content.package, content.filename FROM content WHERE filename LIKE "%.gz" AND (SELECT count(*) FROM hash WHERE hash.cid = content.id AND hash.function = "gzip_sha512") = 0;
+ SELECT package.name, content.filename FROM content JOIN package ON content.pid = package.id WHERE filename LIKE "%.gz" AND (SELECT count(*) FROM hash WHERE hash.cid = content.id AND hash.function = "gzip_sha512") = 0;
diff --git a/autoimport.py b/autoimport.py
index d326d61..694ffeb 100755
--- a/autoimport.py
+++ b/autoimport.py
@@ -93,7 +93,7 @@ def main():
process_file(pkgs, d)
print("reading database")
- cur.execute("SELECT package, version FROM package;")
+ cur.execute("SELECT name, version FROM package;")
knownpkgs = dict((row[0], row[1]) for row in cur.fetchall())
distpkgs = set(pkgs.keys())
if options.new:
@@ -126,7 +126,7 @@ def main():
if options.prune:
delpkgs = knownpkgs - distpkgs
print("clearing packages %s" % " ".join(delpkgs))
- cur.executemany("DELETE FROM package WHERE package = ?;",
+ cur.executemany("DELETE FROM package WHERE name = ?;",
((pkg,) for pkg in delpkgs))
# Tables content, dependency and sharing will also be pruned
# due to ON DELETE CASCADE clauses.
diff --git a/readyaml.py b/readyaml.py
index e2f3bb3..bb8ac54 100755
--- a/readyaml.py
+++ b/readyaml.py
@@ -14,29 +14,36 @@ def readyaml(db, stream):
gen = yaml.safe_load_all(stream)
metadata = next(gen)
package = metadata["package"]
- cur.execute("SELECT version FROM package WHERE package = ?;",
+ cur.execute("SELECT id, version FROM package WHERE name = ?;",
(package,))
row = cur.fetchone()
- if row and version_compare(row[0], metadata["version"]) > 0:
- return
+ if row:
+ pid, version = row
+ if version_compare(version, metadata["version"]) > 0:
+ return
+ else:
+ pid = None
cur.execute("BEGIN;")
- cur.execute("DELETE FROM content WHERE package = ?;",
- (package,))
- cur.execute("INSERT OR REPLACE INTO package (package, version, architecture, source) VALUES (?, ?, ?, ?);",
- (package, metadata["version"], metadata["architecture"],
- metadata["source"]))
- cur.execute("DELETE FROM dependency WHERE package = ?;",
- (package,))
- cur.executemany("INSERT INTO dependency (package, required) VALUES (?, ?);",
- ((package, dep) for dep in metadata["depends"]))
+ if pid is not None:
+ cur.execute("DELETE FROM content WHERE pid = ?;", (pid,))
+ cur.execute("DELETE FROM dependency WHERE pid = ?;", (pid,))
+ cur.execute("UPDATE package SET version = ?, architecture = ?, source = ? WHERE id = ?;",
+ (metadata["version"], metadata["architecture"], metadata["source"], pid))
+ else:
+ cur.execute("INSERT INTO package (name, version, architecture, source) VALUES (?, ?, ?, ?);",
+ (package, metadata["version"], metadata["architecture"],
+ metadata["source"]))
+ pid = cur.lastrowid
+ cur.executemany("INSERT INTO dependency (pid, required) VALUES (?, ?);",
+ ((pid, dep) for dep in metadata["depends"]))
for entry in gen:
if entry == "commit":
db.commit()
return
- cur.execute("INSERT INTO content (package, filename, size) VALUES (?, ?, ?);",
- (package, entry["name"], entry["size"]))
+ cur.execute("INSERT INTO content (pid, filename, size) VALUES (?, ?, ?);",
+ (pid, entry["name"], entry["size"]))
cid = cur.lastrowid
cur.executemany("INSERT INTO hash (cid, function, hash) VALUES (?, ?, ?);",
((cid, func, hexhash)
diff --git a/schema.sql b/schema.sql
index 94f0317..b839a51 100644
--- a/schema.sql
+++ b/schema.sql
@@ -1,11 +1,11 @@
-CREATE TABLE package (package TEXT PRIMARY KEY, version TEXT, architecture TEXT, source TEXT);
-CREATE TABLE content (id INTEGER PRIMARY KEY, package TEXT, filename TEXT, size INTEGER, FOREIGN KEY (package) REFERENCES package(package) ON DELETE CASCADE);
+CREATE TABLE package (id INTEGER PRIMARY KEY, name TEXT UNIQUE, version TEXT, architecture TEXT, source TEXT);
+CREATE TABLE content (id INTEGER PRIMARY KEY, pid INTEGER, filename TEXT, size INTEGER, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE);
CREATE TABLE hash (cid INTEGER, function TEXT, hash TEXT, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE);
-CREATE TABLE dependency (package TEXT, required TEXT, FOREIGN KEY (package) REFERENCES package(package) ON DELETE CASCADE);
-CREATE INDEX content_package_index ON content (package);
+CREATE TABLE dependency (pid INTEGER, required TEXT, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE);
+CREATE INDEX content_package_index ON content (pid);
CREATE INDEX hash_cid_index ON hash (cid);
CREATE INDEX hash_hash_index ON hash (hash);
-CREATE TABLE sharing (package1 TEXT, package2 TEXT, func1 TEXT, func2 TEXT, files INTEGER, size INTEGER, FOREIGN KEY (package1) REFERENCES package(package) ON DELETE CASCADE, FOREIGN KEY (package2) REFERENCES package(package) ON DELETE CASCADE);
-CREATE INDEX sharing_insert_index ON sharing (package1, package2, func1, func2);
+CREATE TABLE sharing (pid1 INTEGER, pid2 INTEGER, func1 TEXT, func2 TEXT, files INTEGER, size INTEGER, FOREIGN KEY (pid1) REFERENCES package(id) ON DELETE CASCADE, FOREIGN KEY (pid2) REFERENCES package(id) ON DELETE CASCADE);
+CREATE INDEX sharing_insert_index ON sharing (pid1, pid2, func1, func2);
CREATE TABLE duplicate (cid INTEGER PRIMARY KEY, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE);
diff --git a/update_sharing.py b/update_sharing.py
index d2b357b..55e8096 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -5,27 +5,27 @@ import sqlite3
from dedup.utils import fetchiter
def add_values(cursor, insert_key, files, size):
- cursor.execute("UPDATE sharing SET files = files + ?, size = size + ? WHERE package1 = ? AND package2 = ? AND func1 = ? AND func2 = ?;",
+ cursor.execute("UPDATE sharing SET files = files + ?, size = size + ? WHERE pid1 = ? AND pid2 = ? AND func1 = ? AND func2 = ?;",
(files, size) + insert_key)
if cursor.rowcount > 0:
return
- cursor.execute("INSERT INTO sharing (package1, package2, func1, func2, files, size) VALUES (?, ?, ?, ?, ?, ?);",
+ cursor.execute("INSERT INTO sharing (pid1, pid2, func1, func2, files, size) VALUES (?, ?, ?, ?, ?, ?);",
insert_key + (files, size))
def compute_pkgdict(rows):
pkgdict = dict()
- for package, _, filename, size, function in rows:
- funcdict = pkgdict.setdefault(package, {})
+ for pid, _, filename, size, function in rows:
+ funcdict = pkgdict.setdefault(pid, {})
funcdict.setdefault(function, []).append((size, filename))
return pkgdict
def process_pkgdict(cursor, pkgdict):
- for package1, funcdict1 in pkgdict.items():
+ for pid1, funcdict1 in pkgdict.items():
for function1, files in funcdict1.items():
numfiles = len(files)
size = sum(entry[0] for entry in files)
- for package2, funcdict2 in pkgdict.items():
- if package1 == package2:
+ for pid2, funcdict2 in pkgdict.items():
+ if pid1 == pid2:
pkgnumfiles = numfiles - 1
pkgsize = size - min(entry[0] for entry in files)
if pkgnumfiles == 0:
@@ -34,7 +34,7 @@ def process_pkgdict(cursor, pkgdict):
pkgnumfiles = numfiles
pkgsize = size
for function2 in funcdict2.keys():
- insert_key = (package1, package2, function1, function2)
+ insert_key = (pid1, pid2, function1, function2)
add_values(cursor, insert_key, pkgnumfiles, pkgsize)
def main():
@@ -46,7 +46,7 @@ def main():
readcur = db.cursor()
readcur.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
for hashvalue, in fetchiter(readcur):
- cur.execute("SELECT content.package, content.id, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;",
+ cur.execute("SELECT content.pid, content.id, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;",
(hashvalue,))
rows = cur.fetchall()
print("processing hash %s with %d entries" % (hashvalue, len(rows)))
diff --git a/webapp.py b/webapp.py
index 86d14f0..9e23128 100755
--- a/webapp.py
+++ b/webapp.py
@@ -248,67 +248,68 @@ class Application(object):
def get_details(self, package):
cur = self.db.cursor()
- cur.execute("SELECT version, architecture FROM package WHERE package = ?;",
+ cur.execute("SELECT id, version, architecture FROM package WHERE name = ?;",
(package,))
row = cur.fetchone()
if not row:
raise NotFound()
- version, architecture = row
- details = dict(package=package,
+ pid, version, architecture = row
+ details = dict(pid=pid,
+ package=package,
version=version,
architecture=architecture)
- cur.execute("SELECT count(filename), sum(size) FROM content WHERE package = ?;",
- (package,))
+ cur.execute("SELECT count(filename), sum(size) FROM content WHERE pid = ?;",
+ (pid,))
num_files, total_size = cur.fetchone()
if total_size is None:
total_size = 0
details.update(dict(num_files=num_files, total_size=total_size))
return details
- def get_dependencies(self, package):
+ def get_dependencies(self, pid):
cur = self.db.cursor()
- cur.execute("SELECT required FROM dependency WHERE package = ?;",
- (package,))
+ cur.execute("SELECT required FROM dependency WHERE pid = ?;",
+ (pid,))
return set(row[0] for row in fetchiter(cur))
- def cached_sharedstats(self, package):
+ def cached_sharedstats(self, pid):
cur = self.db.cursor()
sharedstats = {}
- cur.execute("SELECT package2, func1, func2, files, size FROM sharing WHERE package1 = ?;",
- (package,))
- for package2, func1, func2, files, size in fetchiter(cur):
+ cur.execute("SELECT pid2, package.name, func1, func2, files, size FROM sharing JOIN package ON sharing.pid2 = package.id WHERE pid1 = ?;",
+ (pid,))
+ for pid2, package2, func1, func2, files, size in fetchiter(cur):
if (func1, func2) not in hash_functions:
continue
curstats = sharedstats.setdefault(
function_combination(func1, func2), list())
- if package2 == package:
+ if pid2 == pid:
package2 = None
curstats.append(dict(package=package2, duplicate=files, savable=size))
return sharedstats
def show_package(self, package):
params = self.get_details(package)
- params["dependencies"] = self.get_dependencies(package)
- params["shared"] = self.cached_sharedstats(package)
+ params["dependencies"] = self.get_dependencies(params["pid"])
+ params["shared"] = self.cached_sharedstats(params["pid"])
params["urlroot"] = ".."
return html_response(package_template.render(params))
- def compute_comparison(self, package1, package2):
+ def compute_comparison(self, pid1, pid2):
"""Compute a sequence of comparison objects ordery by the size of the
object in the first package. Each element of the sequence is a dict
defining the following keys:
- * filenames: A set of filenames in package1 all referring to the
- same object.
+ * filenames: A set of filenames in package 1 (pid1) all referring to
+ the same object.
* size: Size of the object in bytes.
- * matches: A mapping from filenames in package2 to a mapping from
- hash function pairs to hash values.
+ * matches: A mapping from filenames in package 2 (pid2) to a mapping
+ from hash function pairs to hash values.
"""
cur = self.db.cursor()
- cur.execute("SELECT id, filename, size, hash FROM content JOIN hash ON content.id = hash.cid JOIN duplicate ON content.id = duplicate.cid WHERE package = ? AND function = 'sha512' ORDER BY size DESC;",
- (package1,))
+ cur.execute("SELECT id, filename, size, hash FROM content JOIN hash ON content.id = hash.cid JOIN duplicate ON content.id = duplicate.cid WHERE pid = ? AND function = 'sha512' ORDER BY size DESC;",
+ (pid1,))
cursize = -1
files = dict()
- minmatch = 2 if package1 == package2 else 1
+ minmatch = 2 if pid1 == pid2 else 1
for cid, filename, size, hashvalue in fetchiter(cur):
if cursize != size:
for entry in files.values():
@@ -325,8 +326,8 @@ class Application(object):
files[hashvalue] = entry
cur2 = self.db.cursor()
- cur2.execute("SELECT ha.function, ha.hash, hb.function, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id WHERE ha.cid = ? AND package = ?;",
- (cid, package2))
+ cur2.execute("SELECT ha.function, ha.hash, hb.function, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id WHERE ha.cid = ? AND pid = ?;",
+ (cid, pid2))
for func1, hashvalue, func2, filename in fetchiter(cur2):
entry["matches"].setdefault(filename, {})[func1, func2] = \
hashvalue
@@ -342,7 +343,7 @@ class Application(object):
if package1 != package2:
details2 = self.get_details(package2)
- shared = self.compute_comparison(package1, package2)
+ shared = self.compute_comparison(details1["pid"], details2["pid"])
params = dict(
details1=details1,
details2=details2,
@@ -352,7 +353,7 @@ class Application(object):
def show_hash(self, function, hashvalue):
cur = self.db.cursor()
- cur.execute("SELECT content.package, content.filename, content.size, hash.function FROM content JOIN hash ON content.id = hash.cid WHERE hash = ?;",
+ cur.execute("SELECT package.name, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id WHERE hash = ?;",
(hashvalue,))
entries = [dict(package=package, filename=filename, size=size,
function=otherfunc)
@@ -366,12 +367,12 @@ class Application(object):
def show_source(self, package):
cur = self.db.cursor()
- cur.execute("SELECT package FROM package WHERE source = ?;",
+ cur.execute("SELECT name FROM package WHERE source = ?;",
(package,))
binpkgs = dict.fromkeys(pkg for pkg, in fetchiter(cur))
if not binpkgs:
raise NotFound
- cur.execute("SELECT package.package, sharing.package2, sharing.func1, sharing.func2, sharing.files, sharing.size FROM package JOIN sharing ON package.package = sharing.package1 WHERE package.source = ?;",
+ cur.execute("SELECT p1.name, p2.name, sharing.func1, sharing.func2, sharing.files, sharing.size FROM sharing JOIN package AS p1 ON sharing.pid1 = p1.id JOIN package AS p2 ON sharing.pid2 = p2.id WHERE p1.source = ?;",
(package,))
for binary, otherbin, func1, func2, files, size in fetchiter(cur):
entry = dict(package=otherbin,