summaryrefslogtreecommitdiff
path: root/README
diff options
context:
space:
mode:
authorHelmut Grohne <helmut@subdivi.de>2013-07-10 16:16:45 +0200
committerHelmut Grohne <helmut@subdivi.de>2013-07-10 16:16:45 +0200
commit14020d53fd4853aa72f159885edad1dcbfce2ee7 (patch)
tree3d1182588e2a9f73941f88dd80bd4b10b04aca53 /README
parentada4f94466bf3eddc192cf22c8ecefc9cd5f0ea3 (diff)
downloaddebian-dedup-14020d53fd4853aa72f159885edad1dcbfce2ee7.tar.gz
schema: reference package table by integer key
One approach to improve performance is to reduce the database size. A package name takes up 15 bytes in average. A number of a package takes up two bytes. Multiply that difference with the number of references and it should be noticeably. A small test set show a reduction by 10%.
Diffstat (limited to 'README')
-rw-r--r--README8
1 files changed, 4 insertions, 4 deletions
diff --git a/README b/README
index 55659dc..44b086a 100644
--- a/README
+++ b/README
@@ -38,17 +38,17 @@ SQL database by hand. Here are some example queries.
Finding the 100 largest files shared with multiple packages.
- SELECT a.package, a.filename, b.package, b.filename, a.size FROM content AS a JOIN hash AS ha ON a.id = ha.cid JOIN hash AS hb ON ha.hash = hb.hash JOIN content AS b ON b.id = hb.cid WHERE (a.package != b.package OR a.filename != b.filename) ORDER BY a.size DESC LIMIT 100;
+ SELECT pa.name, a.filename, pb.name, b.filename, a.size FROM content AS a JOIN hash AS ha ON a.id = ha.cid JOIN hash AS hb ON ha.hash = hb.hash JOIN content AS b ON b.id = hb.cid JOIN package AS pa ON b.pid = pa.id JOIN package AS pb ON b.pid = pb.id WHERE (a.pid != b.pid OR a.filename != b.filename) ORDER BY a.size DESC LIMIT 100;
Finding those top 100 files that save most space when being reduced to only
one copy in the archive.
- SELECT hash, sum(size)-min(size), count(*), count(distinct package) FROM content JOIN hash ON content.id = hash.cid WHERE hash.function = "sha512" GROUP BY hash ORDER BY sum(size)-min(size) DESC LIMIT 100;
+ SELECT hash, sum(size)-min(size), count(*), count(distinct pid) FROM content JOIN hash ON content.id = hash.cid WHERE hash.function = "sha512" GROUP BY hash ORDER BY sum(size)-min(size) DESC LIMIT 100;
Finding PNG images that do not carry a .png file extension.
- SELECT package, filename, size FROM content JOIN hash ON content.id = hash.cid WHERE function = "image_sha512" AND filename NOT LIKE "%.png";
+ SELECT package.name, content.filename, content.size FROM content JOIN hash ON content.id = hash.cid JOIN package ON content.pid = package.id WHERE function = "image_sha512" AND filename NOT LIKE "%.png";
Finding .gz files which either are not gziped or contain errors.
- SELECT content.package, content.filename FROM content WHERE filename LIKE "%.gz" AND (SELECT count(*) FROM hash WHERE hash.cid = content.id AND hash.function = "gzip_sha512") = 0;
+ SELECT package.name, content.filename FROM content JOIN package ON content.pid = package.id WHERE filename LIKE "%.gz" AND (SELECT count(*) FROM hash WHERE hash.cid = content.id AND hash.function = "gzip_sha512") = 0;