Merge branch master into sqlalchemy

This basically pulls the packageid branch into sqlalchemy. The merge was complex, because many sql statements diverged. The merge brings us one step closer to supporting postgres, because an "INSERT OR REPLACE" was removed from readyaml.py in the packageid branch. Conflicts: update_sharing.py webapp.py
author: Helmut Grohne <helmut@subdivi.de> 2013-07-17 16:27:08 +0200
committer: Helmut Grohne <helmut@subdivi.de> 2013-07-17 16:27:08 +0200
commit: ed3e611cfc54b8c916e919701070bfd5c6770610 (patch)
tree: fe06694d5a2212c87a0d149eccb0f4cbb889a5cd /README
parent: a03daac99a237babcd874748d19fc0f809a1dc60 (diff)
parent: 6205c89b1e289f04dcea1e6e32fafa6357abf063 (diff)
download: debian-dedup-ed3e611cfc54b8c916e919701070bfd5c6770610.tar.gz
1 files changed, 8 insertions, 4 deletions
diff --git a/README b/README
index d4c950c..c3ffcb3 100644
--- a/README
+++ b/README
@@ -27,6 +27,10 @@ Import a full mirror (only http supported):
 
     ./autoimport.py -n -p http://your.mirror.example/debian
 
+After changing the database, a few tables caching expensive computations need
+to be (re)generated. Execute `./update_sharing.py`. Without this step the web
+interface will report wrong results.
+
 Viewing the results
 -------------------
 Run `./webapp.py` and enjoy a webinterface at `0.0.0.0:8800` or inspect the
@@ -34,17 +38,17 @@ SQL database by hand. Here are some example queries.
 
 Finding the 100 largest files shared with multiple packages.
 
-    SELECT a.package, a.filename, b.package, b.filename, a.size FROM content AS a JOIN hash AS ha ON a.id = ha.cid JOIN hash AS hb ON ha.hash = hb.hash JOIN content AS b ON b.id = hb.cid WHERE (a.package != b.package OR a.filename != b.filename) ORDER BY a.size DESC LIMIT 100;
+    SELECT pa.name, a.filename, pb.name, b.filename, a.size FROM content AS a JOIN hash AS ha ON a.id = ha.cid JOIN hash AS hb ON ha.hash = hb.hash JOIN content AS b ON b.id = hb.cid JOIN package AS pa ON b.pid = pa.id JOIN package AS pb ON b.pid = pb.id WHERE (a.pid != b.pid OR a.filename != b.filename) ORDER BY a.size DESC LIMIT 100;
 
 Finding those top 100 files that save most space when being reduced to only
 one copy in the archive.
 
-    SELECT hash, sum(size)-min(size), count(*), count(distinct package) FROM content JOIN hash ON content.id = hash.cid WHERE hash.function = "sha512" GROUP BY hash ORDER BY sum(size)-min(size) DESC LIMIT 100;
+    SELECT hash, sum(size)-min(size), count(*), count(distinct pid) FROM content JOIN hash ON content.id = hash.cid WHERE hash.function = "sha512" GROUP BY hash ORDER BY sum(size)-min(size) DESC LIMIT 100;
 
 Finding PNG images that do not carry a .png file extension.
 
-    SELECT package, filename, size FROM content JOIN hash ON content.id = hash.cid WHERE function = "image_sha512" AND filename NOT LIKE "%.png";
+    SELECT package.name, content.filename, content.size FROM content JOIN hash ON content.id = hash.cid JOIN package ON content.pid = package.id WHERE function = "image_sha512" AND filename NOT LIKE "%.png";
 
 Finding .gz files which either are not gziped or contain errors.
 
-    SELECT content.package, content.filename FROM content WHERE filename LIKE "%.gz" AND (SELECT count(*) FROM hash WHERE hash.cid = content.id AND hash.function = "gzip_sha512") = 0;
+    SELECT package.name, content.filename FROM content JOIN package ON content.pid = package.id WHERE filename LIKE "%.gz" AND (SELECT count(*) FROM hash WHERE hash.cid = content.id AND hash.function = "gzip_sha512") = 0;
author	Helmut Grohne <helmut@subdivi.de>	2013-07-17 16:27:08 +0200
committer	Helmut Grohne <helmut@subdivi.de>	2013-07-17 16:27:08 +0200
commit	ed3e611cfc54b8c916e919701070bfd5c6770610 (patch)
tree	fe06694d5a2212c87a0d149eccb0f4cbb889a5cd /README
parent	a03daac99a237babcd874748d19fc0f809a1dc60 (diff)
parent	6205c89b1e289f04dcea1e6e32fafa6357abf063 (diff)
download	debian-dedup-ed3e611cfc54b8c916e919701070bfd5c6770610.tar.gz