diff options
author | Helmut Grohne <helmut@subdivi.de> | 2014-03-08 12:39:32 +0100 |
---|---|---|
committer | Helmut Grohne <helmut@subdivi.de> | 2014-03-08 12:39:32 +0100 |
commit | c6a30cefff55cd247a47fa0a2d4f819592e1202b (patch) | |
tree | 58b6ff52bc6827782c2973f1ce976e245ce5f34c /update_sharing.py | |
parent | 751f19ec1107c9059ae4834e4b757741ebee6cbd (diff) | |
parent | bb0aea9971bc79d8787d8f034022d0ca803fcab3 (diff) | |
download | debian-dedup-c6a30cefff55cd247a47fa0a2d4f819592e1202b.tar.gz |
Merge branch 'master' into sqlalchemy
In the meantime, the master branch has evolved quite a bit and the schema
has changed again (eqclass was added to the function table). The main reason
for the merge is to resolve the large number of conflicts once, so that
development of the sqlalchemy branch can continue and still benefit from
changes in the master branch, such as schema compatibility and the adapted
indent level in the web app (due to the use of contextlib.closing, which
resembles sqlalchemy's "with db.begin() as conn:").
Conflicts:
autoimport.py
dedup/utils.py
readyaml.py
update_sharing.py
webapp.py
Diffstat (limited to 'update_sharing.py')
-rwxr-xr-x | update_sharing.py | 42 |
1 files changed, 28 insertions, 14 deletions
diff --git a/update_sharing.py b/update_sharing.py
index 6fd83f8..450bfc7 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -1,5 +1,7 @@
 #!/usr/bin/python
 
+import optparse
+
 import sqlalchemy
 
 from dedup.utils import fetchiter, enable_sqlite_foreign_keys
@@ -39,30 +41,42 @@ def process_pkgdict(conn, pkgdict):
                     insert_key = (pid1, pid2, fid1, fid2)
                     add_values(conn, insert_key, pkgnumfiles, pkgsize)
 
-def main():
-    db = sqlalchemy.create_engine("sqlite:///test.sqlite3")
-    enable_sqlite_foreign_keys(db)
+def main(db):
     with db.begin() as conn:
         conn.execute("DELETE FROM sharing;")
         conn.execute("DELETE FROM duplicate;")
         conn.execute("DELETE FROM issue;")
         readcur = conn.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
         for hashvalue, in fetchiter(readcur):
-            rows = conn.execute(sqlalchemy.text("SELECT content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id WHERE hash = :hashvalue;"),
+            rows = conn.execute(sqlalchemy.text("SELECT function.eqclass, content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id JOIN function ON hash.fid = function.id AND function.eqclass IS NOT NULL WHERE hash = :hashvalue;"),
                                 hashvalue=hashvalue).fetchall()
-            print("processing hash %s with %d entries" % (hashvalue, len(rows)))
-            pkgdict = compute_pkgdict(rows)
+            rowdict = dict()
             for row in rows:
-                cid = row[1]
-                already = conn.scalar(sqlalchemy.text("SELECT cid FROM duplicate WHERE cid = :cid;"),
-                                      cid=cid)
-                if not already:
-                    conn.execute(sqlalchemy.text("INSERT INTO duplicate (cid) VALUES (:cid);"),
-                                 cid=cid)
-            process_pkgdict(conn, pkgdict)
+                rowdict.setdefault(row[0], []).append(row[1:])
+            for eqclass, rows in rowdict.items():
+                if len(rows) < 2:
+                    print("skipping hash %s class %d with too few entries" % (hashvalue, eqclass))
+                    continue
+                print("processing hash %s class %d with %d entries" % (hashvalue, eqclass, len(rows)))
+                pkgdict = compute_pkgdict(rows)
+                for row in rows:
+                    cid = row[1]
+                    already = conn.scalar(sqlalchemy.text("SELECT cid FROM duplicate WHERE cid = :cid;"),
+                                          cid=cid)
+                    if not already:
+                        conn.execute(sqlalchemy.text("INSERT INTO duplicate (cid) VALUES (:cid);"),
+                                     cid=cid)
+                process_pkgdict(conn, pkgdict)
         conn.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'file named something.gz is not a valid gzip file' FROM content WHERE content.filename LIKE '%.gz' AND NOT EXISTS (SELECT 1 FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = 'gzip_sha512');")
         conn.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'png_sha512' AND lower(filename) NOT LIKE '%.png';")
         conn.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'gif image not named something.gif' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'gif_sha512' AND lower(filename) NOT LIKE '%.gif';")
 
 if __name__ == "__main__":
-    main()
+    parser = optparse.OptionParser()
+    parser.add_option("-d", "--database", action="store",
+                      default="sqlite:///test.sqlite3",
+                      help="location of the database")
+    options, args = parser.parse_args()
+    db = sqlalchemy.create_engine(options.database)
+    enable_sqlite_foreign_keys(db)
+    main(db)