diff options
author | Helmut Grohne <helmut@subdivi.de> | 2013-08-02 08:40:49 +0200 |
---|---|---|
committer | Helmut Grohne <helmut@subdivi.de> | 2013-08-02 08:40:49 +0200 |
commit | cb3708825bf7ea32314040575cef35980dad0cd8 (patch) | |
tree | 31575a8525dc90ba6904268d94f47e1604bf0557 /README | |
parent | a4bbbb6e664e605634cb3f9e0564c7e4a93697be (diff) | |
parent | 2712edb550968ce7ec8cd9800241d7944666631a (diff) | |
download | debian-dedup-cb3708825bf7ea32314040575cef35980dad0cd8.tar.gz |
Merge branch master into sqlalchemy
This makes the sqlalchemy branch schema-compatible with master again.
The biggest change on master was the introduction of the function table.
It caused most of the conflicts. Note that webapp had one conflict not
detected by git: The selecting of issues in show_package needed
sqlalchemy conversion.
Conflicts:
README
update_sharing.py
webapp.py
Diffstat (limited to 'README')
-rw-r--r-- | README | 10 |
1 files changed, 5 insertions, 5 deletions
@@ -1,7 +1,7 @@ Required packages ----------------- - aptitude install python python-debian python-lzma curl python-jinja2 python-werkzeug sqlite3 python-imaging python-yaml python-concurrent.futures python-sqlalchemy + aptitude install python python-debian python-lzma curl python-jinja2 python-werkzeug sqlite3 python-imaging python-yaml python-concurrent.futures python-pkg-resources python-sqlalchemy Create a database ----------------- @@ -38,17 +38,17 @@ SQL database by hand. Here are some example queries. Finding the 100 largest files shared with multiple packages. - SELECT pa.name, a.filename, pb.name, b.filename, a.size FROM content AS a JOIN hash AS ha ON a.id = ha.cid JOIN hash AS hb ON ha.hash = hb.hash JOIN content AS b ON b.id = hb.cid JOIN package AS pa ON b.pid = pa.id JOIN package AS pb ON b.pid = pb.id WHERE (a.pid != b.pid OR a.filename != b.filename) ORDER BY a.size DESC LIMIT 100; + SELECT pa.name, a.filename, pb.name, b.filename, a.size FROM content AS a JOIN hash AS ha ON a.id = ha.cid JOIN hash AS hb ON ha.hash = hb.hash JOIN content AS b ON b.id = hb.cid JOIN package AS pa ON a.pid = pa.id JOIN package AS pb ON b.pid = pb.id WHERE (a.pid != b.pid OR a.filename != b.filename) ORDER BY a.size DESC LIMIT 100; Finding those top 100 files that save most space when being reduced to only one copy in the archive. - SELECT hash, sum(size)-min(size), count(*), count(distinct pid) FROM content JOIN hash ON content.id = hash.cid WHERE hash.function = "sha512" GROUP BY hash ORDER BY sum(size)-min(size) DESC LIMIT 100; + SELECT hash, sum(size)-min(size), count(*), count(distinct pid) FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = "sha512" GROUP BY hash ORDER BY sum(size)-min(size) DESC LIMIT 100; Finding PNG images that do not carry a .png file extension. - SELECT package.name, content.filename, content.size FROM content JOIN hash ON content.id = hash.cid JOIN package ON content.pid = package.id WHERE function = "image_sha512" AND filename NOT LIKE "%.png"; + SELECT package.name, content.filename, content.size FROM content JOIN hash ON content.id = hash.cid JOIN package ON content.pid = package.id JOIN function ON hash.fid = function.id WHERE function.name = "png_sha512" AND lower(filename) NOT LIKE "%.png"; Finding .gz files which either are not gziped or contain errors. - SELECT package.name, content.filename FROM content JOIN package ON content.pid = package.id WHERE filename LIKE "%.gz" AND (SELECT count(*) FROM hash WHERE hash.cid = content.id AND hash.function = "gzip_sha512") = 0; + SELECT package.name, content.filename FROM content JOIN package ON content.pid = package.id WHERE filename LIKE "%.gz" AND (SELECT count(*) FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = "gzip_sha512") = 0; |