summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README4
-rw-r--r--schema.sql1
-rwxr-xr-xupdate_sharing.py3
-rwxr-xr-xwebapp.py36
4 files changed, 20 insertions, 24 deletions
diff --git a/README b/README
index b0e06f3..ef0ae48 100644
--- a/README
+++ b/README
@@ -38,7 +38,7 @@ SQL database by hand. Here are some example queries.
Finding the 100 largest files shared with multiple packages.
- SELECT pa.name, a.filename, pb.name, b.filename, a.size FROM content AS a JOIN hash AS ha ON a.id = ha.cid JOIN hash AS hb ON ha.hash = hb.hash JOIN content AS b ON b.id = hb.cid JOIN package AS pa ON b.pid = pa.id JOIN package AS pb ON b.pid = pb.id WHERE (a.pid != b.pid OR a.filename != b.filename) ORDER BY a.size DESC LIMIT 100;
+ SELECT pa.name, a.filename, pb.name, b.filename, a.size FROM content AS a JOIN hash AS ha ON a.id = ha.cid JOIN hash AS hb ON ha.hash = hb.hash JOIN content AS b ON b.id = hb.cid JOIN package AS pa ON a.pid = pa.id JOIN package AS pb ON b.pid = pb.id WHERE (a.pid != b.pid OR a.filename != b.filename) ORDER BY a.size DESC LIMIT 100;
Finding those top 100 files that save most space when being reduced to only
one copy in the archive.
@@ -47,7 +47,7 @@ one copy in the archive.
Finding PNG images that do not carry a .png file extension.
- SELECT package.name, content.filename, content.size FROM content JOIN hash ON content.id = hash.cid JOIN package ON content.pid = package.id JOIN function ON hash.fid = function.id WHERE function.name = "image_sha512" AND filename NOT LIKE "%.png";
+ SELECT package.name, content.filename, content.size FROM content JOIN hash ON content.id = hash.cid JOIN package ON content.pid = package.id JOIN function ON hash.fid = function.id WHERE function.name = "image_sha512" AND lower(filename) NOT LIKE "%.png";
Finding .gz files which either are not gziped or contain errors.
diff --git a/schema.sql b/schema.sql
index cb6a2c5..13a65aa 100644
--- a/schema.sql
+++ b/schema.sql
@@ -17,3 +17,4 @@ CREATE TABLE sharing (
size INTEGER);
CREATE INDEX sharing_insert_index ON sharing (pid1, pid2, fid1, fid2);
CREATE TABLE duplicate (cid INTEGER PRIMARY KEY, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE);
+CREATE TABLE issue (cid INTEGER REFERENCES content(id) ON DELETE CASCADE, issue TEXT);
diff --git a/update_sharing.py b/update_sharing.py
index e1a2d68..4669759 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -43,6 +43,7 @@ def main():
cur.execute("PRAGMA foreign_keys = ON;")
cur.execute("DELETE FROM sharing;")
cur.execute("DELETE FROM duplicate;")
+ cur.execute("DELETE FROM issue;")
readcur = db.cursor()
readcur.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
for hashvalue, in fetchiter(readcur):
@@ -54,6 +55,8 @@ def main():
cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);",
[(row[1],) for row in rows])
process_pkgdict(cur, pkgdict)
+ cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'file named something.gz is not a valid gzip file' FROM content WHERE content.filename LIKE '%.gz' AND NOT EXISTS (SELECT 1 FROM hash WHERE hash.cid = content.id AND hash.function = 'gzip_sha512');")
+ cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid WHERE function = 'image_sha512' AND lower(filename) NOT LIKE '%.png';")
db.commit()
if __name__ == "__main__":
diff --git a/webapp.py b/webapp.py
index d42e932..c080d41 100755
--- a/webapp.py
+++ b/webapp.py
@@ -69,6 +69,14 @@ package_template = jinjaenv.from_string(
{%- endfor -%}
<p>Note: Packages with yellow background are required to be installed when this package is installed.</p>
{%- endif -%}
+{%- if issues -%}
+ <h3>issues with particular files</h3>
+ <table border='1'><tr><th>filename</th><th>issue</th></tr>
+ {%- for filename, issue in issues|dictsort(true) -%}
+ <tr><td><span class="filename">{{ filename|e }}</span></td><td>{{ issue|e }}</td></tr>
+ {%- endfor -%}
+ </table>
+{%- endif -%}
{% endblock %}""")
detail_template = jinjaenv.from_string(
@@ -151,7 +159,7 @@ index_template = jinjaenv.from_string(
<noscript><b>This form is disfunctional when javascript is not enabled</b></noscript>
Enter binary package to inspect - Note: Non-existing packages will result in <b>404</b>-Errors
<form id="pkg_form">
- <label for="pkg_name">Name: <input type="text" size="30" name="pkg_name" id="pkg_name">
+ <label for="pkg_name">Name: </label><input type="text" size="30" name="pkg_name" id="pkg_name">
<input type="submit" value="Go"> Permanent Link: <a id="perma_link" href="#"></a>
</form>
</fieldset></div></li>
@@ -193,27 +201,6 @@ def html_response(unicode_iterator, max_age=24 * 60 * 60):
resp.expires = datetime.datetime.now() + datetime.timedelta(seconds=max_age)
return resp
-def generate_shared(rows):
- """internal helper from show_detail"""
- entry = None
- for filename1, size1, func1, filename2, size2, func2, hashvalue in rows:
- funccomb = (func1, func2)
- if funccomb not in hash_functions:
- continue
- if entry and (entry["filename1"] != filename1 or
- entry["filename2"] != filename2):
- yield entry
- entry = None
- if entry:
- funcdict = entry["functions"]
- else:
- funcdict = dict()
- entry = dict(filename1=filename1, filename2=filename2, size1=size1,
- size2=size2, functions=funcdict)
- funcdict[funccomb] = hashvalue
- if entry:
- yield entry
-
class Application(object):
def __init__(self, db):
self.db = db
@@ -292,6 +279,11 @@ class Application(object):
params["dependencies"] = self.get_dependencies(params["pid"])
params["shared"] = self.cached_sharedstats(params["pid"])
params["urlroot"] = ".."
+ cur = self.db.cursor()
+ cur.execute("SELECT content.filename, issue.issue FROM content JOIN issue ON content.id = issue.cid WHERE content.pid = ?;",
+ (params["pid"],))
+ params["issues"] = dict(cur.fetchall())
+ cur.close()
return html_response(package_template.render(params))
def compute_comparison(self, pid1, pid2):