summaryrefslogtreecommitdiff
path: root/update_sharing.py
diff options
context:
space:
mode:
authorHelmut Grohne <helmut@subdivi.de>2014-03-08 12:39:32 +0100
committerHelmut Grohne <helmut@subdivi.de>2014-03-08 12:39:32 +0100
commitc6a30cefff55cd247a47fa0a2d4f819592e1202b (patch)
tree58b6ff52bc6827782c2973f1ce976e245ce5f34c /update_sharing.py
parent751f19ec1107c9059ae4834e4b757741ebee6cbd (diff)
parentbb0aea9971bc79d8787d8f034022d0ca803fcab3 (diff)
downloaddebian-dedup-c6a30cefff55cd247a47fa0a2d4f819592e1202b.tar.gz
Merge branch 'master' into sqlalchemy
In the meantime, the master branch evolved quite a bit and the schema changed again (eqclass added to the function table). The main reason for the merge is to resolve the large number of conflicts once, so that development of the sqlalchemy branch can continue and still benefit from changes in the master branch, such as schema compatibility and adapting the indent level in the web app due to the use of contextlib.closing, which resembles sqlalchemy's "with db.begin() as conn:". Conflicts: autoimport.py dedup/utils.py readyaml.py update_sharing.py webapp.py
Diffstat (limited to 'update_sharing.py')
-rwxr-xr-xupdate_sharing.py42
1 files changed, 28 insertions, 14 deletions
diff --git a/update_sharing.py b/update_sharing.py
index 6fd83f8..450bfc7 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -1,5 +1,7 @@
#!/usr/bin/python
+import optparse
+
import sqlalchemy
from dedup.utils import fetchiter, enable_sqlite_foreign_keys
@@ -39,30 +41,42 @@ def process_pkgdict(conn, pkgdict):
insert_key = (pid1, pid2, fid1, fid2)
add_values(conn, insert_key, pkgnumfiles, pkgsize)
-def main():
- db = sqlalchemy.create_engine("sqlite:///test.sqlite3")
- enable_sqlite_foreign_keys(db)
+def main(db):
with db.begin() as conn:
conn.execute("DELETE FROM sharing;")
conn.execute("DELETE FROM duplicate;")
conn.execute("DELETE FROM issue;")
readcur = conn.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
for hashvalue, in fetchiter(readcur):
- rows = conn.execute(sqlalchemy.text("SELECT content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id WHERE hash = :hashvalue;"),
+ rows = conn.execute(sqlalchemy.text("SELECT function.eqclass, content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id JOIN function ON hash.fid = function.id AND function.eqclass IS NOT NULL WHERE hash = :hashvalue;"),
hashvalue=hashvalue).fetchall()
- print("processing hash %s with %d entries" % (hashvalue, len(rows)))
- pkgdict = compute_pkgdict(rows)
+ rowdict = dict()
for row in rows:
- cid = row[1]
- already = conn.scalar(sqlalchemy.text("SELECT cid FROM duplicate WHERE cid = :cid;"),
- cid=cid)
- if not already:
- conn.execute(sqlalchemy.text("INSERT INTO duplicate (cid) VALUES (:cid);"),
- cid=cid)
- process_pkgdict(conn, pkgdict)
+ rowdict.setdefault(row[0], []).append(row[1:])
+ for eqclass, rows in rowdict.items():
+ if len(rows) < 2:
+ print("skipping hash %s class %d with too few entries" % (hashvalue, eqclass))
+ continue
+ print("processing hash %s class %d with %d entries" % (hashvalue, eqclass, len(rows)))
+ pkgdict = compute_pkgdict(rows)
+ for row in rows:
+ cid = row[1]
+ already = conn.scalar(sqlalchemy.text("SELECT cid FROM duplicate WHERE cid = :cid;"),
+ cid=cid)
+ if not already:
+ conn.execute(sqlalchemy.text("INSERT INTO duplicate (cid) VALUES (:cid);"),
+ cid=cid)
+ process_pkgdict(conn, pkgdict)
conn.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'file named something.gz is not a valid gzip file' FROM content WHERE content.filename LIKE '%.gz' AND NOT EXISTS (SELECT 1 FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = 'gzip_sha512');")
conn.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'png_sha512' AND lower(filename) NOT LIKE '%.png';")
conn.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'gif image not named something.gif' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'gif_sha512' AND lower(filename) NOT LIKE '%.gif';")
if __name__ == "__main__":
- main()
+ parser = optparse.OptionParser()
+ parser.add_option("-d", "--database", action="store",
+ default="sqlite:///test.sqlite3",
+ help="location of the database")
+ options, args = parser.parse_args()
+ db = sqlalchemy.create_engine(options.database)
+ enable_sqlite_foreign_keys(db)
+ main(db)