diff options
-rw-r--r-- | README | 4 | ||||
-rw-r--r-- | schema.sql | 1 | ||||
-rwxr-xr-x | update_sharing.py | 3 | ||||
-rwxr-xr-x | webapp.py | 36 |
4 files changed, 20 insertions, 24 deletions
@@ -38,7 +38,7 @@ SQL database by hand. Here are some example queries. Finding the 100 largest files shared with multiple packages. - SELECT pa.name, a.filename, pb.name, b.filename, a.size FROM content AS a JOIN hash AS ha ON a.id = ha.cid JOIN hash AS hb ON ha.hash = hb.hash JOIN content AS b ON b.id = hb.cid JOIN package AS pa ON b.pid = pa.id JOIN package AS pb ON b.pid = pb.id WHERE (a.pid != b.pid OR a.filename != b.filename) ORDER BY a.size DESC LIMIT 100; + SELECT pa.name, a.filename, pb.name, b.filename, a.size FROM content AS a JOIN hash AS ha ON a.id = ha.cid JOIN hash AS hb ON ha.hash = hb.hash JOIN content AS b ON b.id = hb.cid JOIN package AS pa ON a.pid = pa.id JOIN package AS pb ON b.pid = pb.id WHERE (a.pid != b.pid OR a.filename != b.filename) ORDER BY a.size DESC LIMIT 100; Finding those top 100 files that save most space when being reduced to only one copy in the archive. @@ -47,7 +47,7 @@ one copy in the archive. Finding PNG images that do not carry a .png file extension. - SELECT package.name, content.filename, content.size FROM content JOIN hash ON content.id = hash.cid JOIN package ON content.pid = package.id JOIN function ON hash.fid = function.id WHERE function.name = "image_sha512" AND filename NOT LIKE "%.png"; + SELECT package.name, content.filename, content.size FROM content JOIN hash ON content.id = hash.cid JOIN package ON content.pid = package.id JOIN function ON hash.fid = function.id WHERE function.name = "image_sha512" AND lower(filename) NOT LIKE "%.png"; Finding .gz files which either are not gziped or contain errors. @@ -17,3 +17,4 @@ CREATE TABLE sharing ( size INTEGER); CREATE INDEX sharing_insert_index ON sharing (pid1, pid2, fid1, fid2); CREATE TABLE duplicate (cid INTEGER PRIMARY KEY, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE); +CREATE TABLE issue (cid INTEGER REFERENCES content(id) ON DELETE CASCADE, issue TEXT); diff --git a/update_sharing.py b/update_sharing.py index e1a2d68..4669759 100755 --- a/update_sharing.py +++ b/update_sharing.py @@ -43,6 +43,7 @@ def main(): cur.execute("PRAGMA foreign_keys = ON;") cur.execute("DELETE FROM sharing;") cur.execute("DELETE FROM duplicate;") + cur.execute("DELETE FROM issue;") readcur = db.cursor() readcur.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;") for hashvalue, in fetchiter(readcur): @@ -54,6 +55,8 @@ def main(): cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);", [(row[1],) for row in rows]) process_pkgdict(cur, pkgdict) + cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'file named something.gz is not a valid gzip file' FROM content WHERE content.filename LIKE '%.gz' AND NOT EXISTS (SELECT 1 FROM hash WHERE hash.cid = content.id AND hash.function = 'gzip_sha512');") + cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid WHERE function = 'image_sha512' AND lower(filename) NOT LIKE '%.png';") db.commit() if __name__ == "__main__": @@ -69,6 +69,14 @@ package_template = jinjaenv.from_string( {%- endfor -%} <p>Note: Packages with yellow background are required to be installed when this package is installed.</p> {%- endif -%} +{%- if issues -%} + <h3>issues with particular files</h3> + <table border='1'><tr><th>filename</th><th>issue</th></tr> + {%- for filename, issue in issues|dictsort(true) -%} + <tr><td><span class="filename">{{ filename|e }}</span></td><td>{{ issue|e }}</td></tr> + {%- endfor -%} + </table> +{%- endif -%} {% endblock %}""") detail_template = jinjaenv.from_string( @@ -151,7 +159,7 @@ index_template = jinjaenv.from_string( <noscript><b>This form is disfunctional when javascript is not enabled</b></noscript> Enter binary package to inspect - Note: Non-existing packages will result in <b>404</b>-Errors <form id="pkg_form"> - <label for="pkg_name">Name: <input type="text" size="30" name="pkg_name" id="pkg_name"> + <label for="pkg_name">Name: </label><input type="text" size="30" name="pkg_name" id="pkg_name"> <input type="submit" value="Go"> Permanent Link: <a id="perma_link" href="#"></a> </form> </fieldset></div></li> @@ -193,27 +201,6 @@ def html_response(unicode_iterator, max_age=24 * 60 * 60): resp.expires = datetime.datetime.now() + datetime.timedelta(seconds=max_age) return resp -def generate_shared(rows): - """internal helper from show_detail""" - entry = None - for filename1, size1, func1, filename2, size2, func2, hashvalue in rows: - funccomb = (func1, func2) - if funccomb not in hash_functions: - continue - if entry and (entry["filename1"] != filename1 or - entry["filename2"] != filename2): - yield entry - entry = None - if entry: - funcdict = entry["functions"] - else: - funcdict = dict() - entry = dict(filename1=filename1, filename2=filename2, size1=size1, - size2=size2, functions=funcdict) - funcdict[funccomb] = hashvalue - if entry: - yield entry - class Application(object): def __init__(self, db): self.db = db @@ -292,6 +279,11 @@ class Application(object): params["dependencies"] = self.get_dependencies(params["pid"]) params["shared"] = self.cached_sharedstats(params["pid"]) params["urlroot"] = ".." + cur = self.db.cursor() + cur.execute("SELECT content.filename, issue.issue FROM content JOIN issue ON content.id = issue.cid WHERE content.pid = ?;", + (params["pid"],)) + params["issues"] = dict(cur.fetchall()) + cur.close() return html_response(package_template.render(params)) def compute_comparison(self, pid1, pid2): |