summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README4
-rwxr-xr-xreadyaml.py13
-rw-r--r--schema.sql7
-rwxr-xr-xupdate_sharing.py10
-rwxr-xr-xwebapp.py6
5 files changed, 25 insertions, 15 deletions
diff --git a/README b/README
index 44b086a..ff96fae 100644
--- a/README
+++ b/README
@@ -38,12 +38,12 @@ SQL database by hand. Here are some example queries.
Finding the 100 largest files shared with multiple packages.
- SELECT pa.name, a.filename, pb.name, b.filename, a.size FROM content AS a JOIN hash AS ha ON a.id = ha.cid JOIN hash AS hb ON ha.hash = hb.hash JOIN content AS b ON b.id = hb.cid JOIN package AS pa ON b.pid = pa.id JOIN package AS pb ON b.pid = pb.id WHERE (a.pid != b.pid OR a.filename != b.filename) ORDER BY a.size DESC LIMIT 100;
+ SELECT pa.name, a.filename, pb.name, b.filename, a.size FROM content AS a JOIN hash AS ha ON a.id = ha.cid JOIN hash AS hb ON ha.hid = hb.hid JOIN content AS b ON b.id = hb.cid JOIN package AS pa ON a.pid = pa.id JOIN package AS pb ON b.pid = pb.id WHERE (a.pid != b.pid OR a.filename != b.filename) ORDER BY a.size DESC LIMIT 100;
Finding those top 100 files that save most space when being reduced to only
one copy in the archive.
- SELECT hash, sum(size)-min(size), count(*), count(distinct pid) FROM content JOIN hash ON content.id = hash.cid WHERE hash.function = "sha512" GROUP BY hash ORDER BY sum(size)-min(size) DESC LIMIT 100;
+ SELECT hashvalue.hash, sum(size)-min(size), count(*), count(distinct pid) FROM content JOIN hash ON content.id = hash.cid JOIN hashvalue ON hash.hid = hashvalue.id WHERE hash.function = "sha512" GROUP BY hash.hid ORDER BY sum(size)-min(size) DESC LIMIT 100;
Finding PNG images that do not carry a .png file extension.
diff --git a/readyaml.py b/readyaml.py
index bb8ac54..007ed96 100755
--- a/readyaml.py
+++ b/readyaml.py
@@ -45,9 +45,16 @@ def readyaml(db, stream):
cur.execute("INSERT INTO content (pid, filename, size) VALUES (?, ?, ?);",
(pid, entry["name"], entry["size"]))
cid = cur.lastrowid
- cur.executemany("INSERT INTO hash (cid, function, hash) VALUES (?, ?, ?);",
- ((cid, func, hexhash)
- for func, hexhash in entry["hashes"].items()))
+ for func, hexhash in entry["hashes"].items():
+ cur.execute("SELECT id FROM hashvalue WHERE hash = ?;", (hexhash,))
+ row = cur.fetchone()
+ if row:
+ hid = row[0]
+ else:
+ cur.execute("INSERT INTO hashvalue (hash) VALUES (?);", (hexhash,))
+ hid = cur.lastrowid
+ cur.execute("INSERT INTO hash (cid, function, hid) VALUES (?, ?, ?);",
+ (cid, func, hid))
raise ValueError("missing commit block")
def main():
diff --git a/schema.sql b/schema.sql
index e9e0bcc..1e33bd0 100644
--- a/schema.sql
+++ b/schema.sql
@@ -1,10 +1,13 @@
CREATE TABLE package (id INTEGER PRIMARY KEY, name TEXT UNIQUE, version TEXT, architecture TEXT, source TEXT);
CREATE TABLE content (id INTEGER PRIMARY KEY, pid INTEGER, filename TEXT, size INTEGER, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE);
-CREATE TABLE hash (cid INTEGER, function TEXT, hash TEXT, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE);
+CREATE TABLE hashvalue (id INTEGER PRIMARY KEY, hash TEXT UNIQUE NOT NULL);
+CREATE TABLE hash (cid INTEGER, function TEXT, hid INTEGER, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE, FOREIGN KEY (hid) REFERENCES hashvalue(id));
+CREATE TRIGGER hashvalue_cleanup_trigger AFTER DELETE ON hash FOR EACH ROW BEGIN
+ DELETE FROM hashvalue WHERE hashvalue.id = old.hid AND NOT EXISTS (SELECT hid FROM hash WHERE hash.hid = old.hid); END;
CREATE TABLE dependency (pid INTEGER, required TEXT, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE);
CREATE INDEX content_package_size_index ON content (pid, size);
CREATE INDEX hash_cid_index ON hash (cid);
-CREATE INDEX hash_hash_index ON hash (hash);
+CREATE INDEX hash_hid_index ON hash (hid);
CREATE TABLE sharing (pid1 INTEGER, pid2 INTEGER, func1 TEXT, func2 TEXT, files INTEGER, size INTEGER, FOREIGN KEY (pid1) REFERENCES package(id) ON DELETE CASCADE, FOREIGN KEY (pid2) REFERENCES package(id) ON DELETE CASCADE);
CREATE INDEX sharing_insert_index ON sharing (pid1, pid2, func1, func2);
diff --git a/update_sharing.py b/update_sharing.py
index 55e8096..bca0213 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -44,12 +44,12 @@ def main():
cur.execute("DELETE FROM sharing;")
cur.execute("DELETE FROM duplicate;")
readcur = db.cursor()
- readcur.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
- for hashvalue, in fetchiter(readcur):
- cur.execute("SELECT content.pid, content.id, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;",
- (hashvalue,))
+ readcur.execute("SELECT hid FROM hash GROUP BY hid HAVING count(*) > 1;")
+ for hid, in fetchiter(readcur):
+ cur.execute("SELECT content.pid, content.id, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hid = ?;",
+ (hid,))
rows = cur.fetchall()
- print("processing hash %s with %d entries" % (hashvalue, len(rows)))
+ print("processing hash %d with %d entries" % (hid, len(rows)))
pkgdict = compute_pkgdict(rows)
cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);",
[(row[1],) for row in rows])
diff --git a/webapp.py b/webapp.py
index 9e23128..bcea2f9 100755
--- a/webapp.py
+++ b/webapp.py
@@ -305,7 +305,7 @@ class Application(object):
from hash function pairs to hash values.
"""
cur = self.db.cursor()
- cur.execute("SELECT id, filename, size, hash FROM content JOIN hash ON content.id = hash.cid JOIN duplicate ON content.id = duplicate.cid WHERE pid = ? AND function = 'sha512' ORDER BY size DESC;",
+ cur.execute("SELECT content.id, content.filename, content.size, hashvalue.hash FROM content JOIN hash ON content.id = hash.cid JOIN duplicate ON content.id = duplicate.cid JOIN hashvalue ON hash.hid = hashvalue.id WHERE pid = ? AND function = 'sha512' ORDER BY size DESC;",
(pid1,))
cursize = -1
files = dict()
@@ -326,7 +326,7 @@ class Application(object):
files[hashvalue] = entry
cur2 = self.db.cursor()
- cur2.execute("SELECT ha.function, ha.hash, hb.function, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id WHERE ha.cid = ? AND pid = ?;",
+ cur2.execute("SELECT ha.function, hashvalue.hash, hb.function, filename FROM hash AS ha JOIN hash AS hb ON ha.hid = hb.hid JOIN content ON hb.cid = content.id JOIN hashvalue ON ha.hid = hashvalue.id WHERE ha.cid = ? AND content.pid = ?;",
(cid, pid2))
for func1, hashvalue, func2, filename in fetchiter(cur2):
entry["matches"].setdefault(filename, {})[func1, func2] = \
@@ -353,7 +353,7 @@ class Application(object):
def show_hash(self, function, hashvalue):
cur = self.db.cursor()
- cur.execute("SELECT package.name, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id WHERE hash = ?;",
+ cur.execute("SELECT package.name, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN hashvalue ON hash.hid = hashvalue.id WHERE hashvalue.hash = ?;",
(hashvalue,))
entries = [dict(package=package, filename=filename, size=size,
function=otherfunc)