4 files changed, 121 insertions, 112 deletions
diff --git a/README b/README
index 44b086a..c3ffcb3 100644
--- a/README
+++ b/README
@@ -1,7 +1,7 @@
 Required packages
 -----------------
 
-    aptitude install python python-debian python-lzma curl python-jinja2 python-werkzeug sqlite3 python-imaging python-yaml python-concurrent.futures
+    aptitude install python python-debian python-lzma curl python-jinja2 python-werkzeug sqlite3 python-imaging python-yaml python-concurrent.futures python-sqlalchemy
 
 Create a database
 -----------------
diff --git a/dedup/utils.py b/dedup/utils.py
index 2fae9fd..e4d1c10 100644
--- a/dedup/utils.py
+++ b/dedup/utils.py
@@ -1,3 +1,5 @@
+import sqlalchemy.event
+
 def fetchiter(cursor):
     rows = cursor.fetchmany()
     while rows:
@@ -5,3 +7,8 @@ def fetchiter(cursor):
             yield row
         rows = cursor.fetchmany()
 
+def enable_sqlite_foreign_keys(engine):
+    """Run PRAGMA foreign_keys=ON whenever engine fires its connect event."""
+    @sqlalchemy.event.listens_for(engine, "connect")
+    def pragma_foreign_keys(connection, _):
+        connection.execute("PRAGMA foreign_keys=ON;")
diff --git a/update_sharing.py b/update_sharing.py
index 55e8096..664b627 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -1,16 +1,18 @@
 #!/usr/bin/python
 
-import sqlite3
+import sqlalchemy
 
-from dedup.utils import fetchiter
+from dedup.utils import fetchiter, enable_sqlite_foreign_keys
 
-def add_values(cursor, insert_key, files, size):
-    cursor.execute("UPDATE sharing SET files = files + ?, size = size + ? WHERE pid1 = ? AND pid2 = ? AND func1 = ? AND func2 = ?;",
-                   (files, size) + insert_key)
-    if cursor.rowcount > 0:
+def add_values(conn, insert_key, files, size):
+    params = dict(files=files, size=size, pid1=insert_key[0],
+                  pid2=insert_key[1], func1=insert_key[2], func2=insert_key[3])
+    rows = conn.execute("UPDATE sharing SET files = files + :files, size = size + :size WHERE pid1 = :pid1 AND pid2 = :pid2 AND func1 = :func1 AND func2 = :func2;",
+                        **params)
+    if rows.rowcount > 0:
         return
-    cursor.execute("INSERT INTO sharing (pid1, pid2, func1, func2, files, size) VALUES (?, ?, ?, ?, ?, ?);",
-                   insert_key + (files, size))
+    conn.execute("INSERT INTO sharing (pid1, pid2, func1, func2, files, size) VALUES (:pid1, :pid2, :func1, :func2, :files, :size);",
+                 **params)
 
 def compute_pkgdict(rows):
     pkgdict = dict()
@@ -19,7 +21,7 @@ def compute_pkgdict(rows):
         funcdict.setdefault(function, []).append((size, filename))
     return pkgdict
 
-def process_pkgdict(cursor, pkgdict):
+def process_pkgdict(conn, pkgdict):
     for pid1, funcdict1 in pkgdict.items():
         for function1, files in funcdict1.items():
             numfiles = len(files)
@@ -35,26 +37,28 @@ def process_pkgdict(cursor, pkgdict):
                     pkgsize = size
                 for function2 in funcdict2.keys():
                     insert_key = (pid1, pid2, function1, function2)
-                    add_values(cursor, insert_key, pkgnumfiles, pkgsize)
+                    add_values(conn, insert_key, pkgnumfiles, pkgsize)
 
 def main():
-    db = sqlite3.connect("test.sqlite3")
-    cur = db.cursor()
-    cur.execute("PRAGMA foreign_keys = ON;")
-    cur.execute("DELETE FROM sharing;")
-    cur.execute("DELETE FROM duplicate;")
-    readcur = db.cursor()
-    readcur.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
-    for hashvalue, in fetchiter(readcur):
-        cur.execute("SELECT content.pid, content.id, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;",
-                    (hashvalue,))
-        rows = cur.fetchall()
-        print("processing hash %s with %d entries" % (hashvalue, len(rows)))
-        pkgdict = compute_pkgdict(rows)
-        cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);",
-                        [(row[1],) for row in rows])
-        process_pkgdict(cur, pkgdict)
-    db.commit()
+    db = sqlalchemy.create_engine("sqlite:///test.sqlite3")
+    enable_sqlite_foreign_keys(db)
+    with db.begin() as conn:
+        conn.execute("DELETE FROM sharing;")
+        conn.execute("DELETE FROM duplicate;")
+        readcur = conn.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
+        for hashvalue, in fetchiter(readcur):
+            rows = conn.execute("SELECT content.pid, content.id, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = :hashvalue;",
+                                hashvalue=hashvalue).fetchall()
+            print("processing hash %s with %d entries" % (hashvalue, len(rows)))
+            pkgdict = compute_pkgdict(rows)
+            for row in rows:
+                # scalar() gives None when no row matched; test that, not truthiness.
+                already = conn.scalar("SELECT cid FROM duplicate WHERE cid = :cid;",
+                                      cid=row[1])
+                if already is None:
+                    conn.execute("INSERT INTO duplicate (cid) VALUES (:cid);",
+                                 cid=row[1])
+            process_pkgdict(conn, pkgdict)
 
 if __name__ == "__main__":
     main()
diff --git a/webapp.py b/webapp.py
index 9e23128..2ed84bb 100755
--- a/webapp.py
+++ b/webapp.py
@@ -2,10 +2,10 @@
 
 import datetime
 import os.path
-import sqlite3
 from wsgiref.simple_server import make_server
 
 import jinja2
+import sqlalchemy
 from werkzeug.exceptions import HTTPException, NotFound
 from werkzeug.routing import Map, Rule, RequestRedirect
 from werkzeug.wrappers import Request, Response
@@ -247,44 +247,44 @@ class Application(object):
             return e
 
     def get_details(self, package):
-        cur = self.db.cursor()
-        cur.execute("SELECT id, version, architecture FROM package WHERE name = ?;",
-                    (package,))
-        row = cur.fetchone()
-        if not row:
-            raise NotFound()
-        pid, version, architecture = row
-        details = dict(pid=pid,
-                       package=package,
-                       version=version,
-                       architecture=architecture)
-        cur.execute("SELECT count(filename), sum(size) FROM content WHERE pid = ?;",
-                    (pid,))
-        num_files, total_size = cur.fetchone()
+        with self.db.begin() as conn:
+            row = conn.execute("SELECT id, version, architecture FROM package WHERE name = :name;",
+                               name=package).fetchone()
+            if not row:
+                raise NotFound()
+            pid, version, architecture = row
+            row = conn.execute("SELECT count(filename), sum(size) FROM content WHERE pid = :pid;",
+                               pid=pid).fetchone()
+            num_files, total_size = row
         if total_size is None:
             total_size = 0
-        details.update(dict(num_files=num_files, total_size=total_size))
-        return details
+        return dict(pid=pid,
+                    package=package,
+                    version=version,
+                    architecture=architecture,
+                    num_files=num_files,
+                    total_size=total_size)
 
     def get_dependencies(self, pid):
-        cur = self.db.cursor()
-        cur.execute("SELECT required FROM dependency WHERE pid = ?;",
-                    (pid,))
-        return set(row[0] for row in fetchiter(cur))
+        with self.db.begin() as conn:
+            cur = conn.execute("SELECT required FROM dependency WHERE pid = :pid;",
+                               pid=pid)
+            return set(row[0] for row in fetchiter(cur))
 
     def cached_sharedstats(self, pid):
-        cur = self.db.cursor()
         sharedstats = {}
-        cur.execute("SELECT pid2, package.name, func1, func2, files, size FROM sharing JOIN package ON sharing.pid2 = package.id WHERE pid1 = ?;",
-                    (pid,))
-        for pid2, package2, func1, func2, files, size in fetchiter(cur):
-            if (func1, func2) not in hash_functions:
-                continue
-            curstats = sharedstats.setdefault(
-                    function_combination(func1, func2), list())
-            if pid2 == pid:
-                package2 = None
-            curstats.append(dict(package=package2, duplicate=files, savable=size))
+        with self.db.begin() as conn:
+            cur = conn.execute("SELECT pid2, package.name, func1, func2, files, size FROM sharing JOIN package ON sharing.pid2 = package.id WHERE pid1 = :pid;",
+                               pid=pid)
+            for pid2, package2, func1, func2, files, size in fetchiter(cur):
+                if (func1, func2) not in hash_functions:
+                    continue
+                curstats = sharedstats.setdefault(
+                        function_combination(func1, func2), list())
+                if pid2 == pid:
+                    package2 = None
+                curstats.append(dict(package=package2, duplicate=files,
+                                     savable=size))
         return sharedstats
 
     def show_package(self, package):
@@ -304,35 +304,32 @@ class Application(object):
          * matches: A mapping from filenames in package 2 (pid2) to a mapping
            from hash function pairs to hash values.
         """
-        cur = self.db.cursor()
-        cur.execute("SELECT id, filename, size, hash FROM content JOIN hash ON content.id = hash.cid JOIN duplicate ON content.id = duplicate.cid WHERE pid = ? AND function = 'sha512' ORDER BY size DESC;",
-                    (pid1,))
-        cursize = -1
-        files = dict()
-        minmatch = 2 if pid1 == pid2 else 1
-        for cid, filename, size, hashvalue in fetchiter(cur):
-            if cursize != size:
-                for entry in files.values():
-                    if len(entry["matches"]) >= minmatch:
-                        yield entry
-                files.clear()
-                cursize = size
-
-            if hashvalue in files:
-                files[hashvalue]["filenames"].add(filename)
-                continue
-
-            entry = dict(filenames=set((filename,)), size=size, matches={})
-            files[hashvalue] = entry
-
-            cur2 = self.db.cursor()
-            cur2.execute("SELECT ha.function, ha.hash, hb.function, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id WHERE ha.cid = ? AND pid = ?;",
-                         (cid, pid2))
-            for func1, hashvalue, func2, filename in fetchiter(cur2):
-                entry["matches"].setdefault(filename, {})[func1, func2] = \
-                        hashvalue
-            cur2.close()
-        cur.close()
+        with self.db.begin() as conn:
+            cur = conn.execute("SELECT id, filename, size, hash FROM content JOIN hash ON content.id = hash.cid JOIN duplicate ON content.id = duplicate.cid WHERE pid = :pid AND function = 'sha512' ORDER BY size DESC;",
+                               pid=pid1)
+            cursize = -1
+            files = dict()
+            minmatch = 2 if pid1 == pid2 else 1
+            for cid, filename, size, hashvalue in fetchiter(cur):
+                if cursize != size:
+                    for entry in files.values():
+                        if len(entry["matches"]) >= minmatch:
+                            yield entry
+                    files.clear()
+                    cursize = size
+
+                if hashvalue in files:
+                    files[hashvalue]["filenames"].add(filename)
+                    continue
+
+                entry = dict(filenames=set((filename,)), size=size, matches={})
+                files[hashvalue] = entry
+
+                cur = conn.execute("SELECT ha.function, ha.hash, hb.function, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id WHERE ha.cid = :cid AND pid = :pid;",
+                                   cid=cid, pid=pid2)
+                for func1, hashvalue, func2, filename in fetchiter(cur):
+                    entry["matches"].setdefault(filename, {})[func1, func2] = \
+                            hashvalue
 
         for entry in files.values():
             if len(entry["matches"]) >= minmatch:
@@ -352,13 +349,13 @@ class Application(object):
         return html_response(detail_template.stream(params))
 
     def show_hash(self, function, hashvalue):
-        cur = self.db.cursor()
-        cur.execute("SELECT package.name, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id WHERE hash = ?;",
-                    (hashvalue,))
-        entries = [dict(package=package, filename=filename, size=size,
-                        function=otherfunc)
-                   for package, filename, size, otherfunc in fetchiter(cur)
-                   if (function, otherfunc) in hash_functions]
+        with self.db.begin() as conn:
+            cur = conn.execute("SELECT package.name, content.filename, content.size, hash.function FROM content JOIN hash ON hash.cid = content.id JOIN package ON content.pid = package.id WHERE hash = :hashvalue;",
+                               hashvalue=hashvalue)
+            entries = [dict(package=package, filename=filename, size=size,
+                            function=otherfunc)
+                       for package, filename, size, otherfunc in fetchiter(cur)
+                       if (function, otherfunc) in hash_functions]
         if not entries:
             raise NotFound()
         params = dict(function=function, hashvalue=hashvalue, entries=entries,
@@ -366,26 +363,27 @@ class Application(object):
         return html_response(hash_template.render(params))
 
     def show_source(self, package):
-        cur = self.db.cursor()
-        cur.execute("SELECT name FROM package WHERE source = ?;",
-                    (package,))
-        binpkgs = dict.fromkeys(pkg for pkg, in fetchiter(cur))
-        if not binpkgs:
-            raise NotFound
-        cur.execute("SELECT p1.name, p2.name, sharing.func1, sharing.func2, sharing.files, sharing.size FROM sharing JOIN package AS p1 ON sharing.pid1 = p1.id JOIN package AS p2 ON sharing.pid2 = p2.id WHERE p1.source = ?;",
-                    (package,))
-        for binary, otherbin, func1, func2, files, size in fetchiter(cur):
-            entry = dict(package=otherbin,
-                         funccomb=function_combination(func1, func2),
-                         duplicate=files, savable=size)
-            oldentry = binpkgs.get(binary)
-            if not (oldentry and oldentry["savable"] >= size):
-                binpkgs[binary] = entry
+        with self.db.begin() as conn:
+            cur = conn.execute("SELECT name FROM package WHERE source = :source;",
+                               source=package)
+            binpkgs = dict.fromkeys(pkg for pkg, in fetchiter(cur))
+            if not binpkgs:
+                raise NotFound()
+            cur = conn.execute("SELECT p1.name, p2.name, sharing.func1, sharing.func2, sharing.files, sharing.size FROM sharing JOIN package AS p1 ON sharing.pid1 = p1.id JOIN package AS p2 ON sharing.pid2 = p2.id WHERE p1.source = :source;",
+                               source=package)
+            for binary, otherbin, func1, func2, files, size in fetchiter(cur):
+                entry = dict(package=otherbin,
+                             funccomb=function_combination(func1, func2),
+                             duplicate=files, savable=size)
+                oldentry = binpkgs.get(binary)
+                if not (oldentry and oldentry["savable"] >= size):
+                    binpkgs[binary] = entry
         params = dict(source=package, packages=binpkgs, urlroot="..")
         return html_response(source_template.render(params))
 
 def main():
-    app = Application(sqlite3.connect("test.sqlite3"))
+    db = sqlalchemy.create_engine("sqlite:///test.sqlite3")
+    app = Application(db)
     staticdir = os.path.join(os.path.dirname(__file__), "static")
     app = SharedDataMiddleware(app, {"/": staticdir})
     make_server("0.0.0.0", 8800, app).serve_forever()