From 7528af6d22d3967be9727f6e2d88dfcbf0f78ce9 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Fri, 2 Aug 2013 15:21:56 +0200
Subject: model comparability as an equivalence relation

webapp has had a relation hash_functions that modeled "comparable
functions". Images should not be compared to other files, since it makes
no sense to store them as the RGBA stream that is being hashed. This
comparability property resembles an equivalence relation, so the function
table gains a column eqclass. Each class is represented by a number and
functions are statically assigned to these classes. Now the filtering
happens in SQL instead of Python.
---
 schema.sql |  4 ++--
 webapp.py  | 21 ++++-----------------
 2 files changed, 6 insertions(+), 19 deletions(-)

diff --git a/schema.sql b/schema.sql
index ddc6ccd..2ab7ca7 100644
--- a/schema.sql
+++ b/schema.sql
@@ -1,7 +1,7 @@
 CREATE TABLE package (id INTEGER PRIMARY KEY, name TEXT UNIQUE, version TEXT, architecture TEXT, source TEXT);
 CREATE TABLE content (id INTEGER PRIMARY KEY, pid INTEGER, filename TEXT, size INTEGER, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE);
-CREATE TABLE function (id INTEGER PRIMARY KEY, name TEXT UNIQUE NOT NULL);
-INSERT INTO function (name) VALUES ("sha512"), ("gzip_sha512"), ("png_sha512"), ("gif_sha512");
+CREATE TABLE function (id INTEGER PRIMARY KEY, name TEXT UNIQUE NOT NULL, eqclass INTEGER);
+INSERT INTO function (name, eqclass) VALUES ("sha512", 1), ("gzip_sha512", 1), ("png_sha512", 2), ("gif_sha512", 2);
 CREATE TABLE hash (cid INTEGER, fid INTEGER NOT NULL, hash TEXT, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE, FOREIGN KEY (fid) REFERENCES function(id));
 CREATE TABLE dependency (pid INTEGER, required TEXT, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE);
 CREATE INDEX content_package_size_index ON content (pid, size);
diff --git a/webapp.py b/webapp.py
index 260268a..f202c2e 100755
--- a/webapp.py
+++ b/webapp.py
@@ -12,16 +12,6 @@ from werkzeug.wsgi import SharedDataMiddleware
 
 from dedup.utils import fetchiter
 
-hash_functions = [
-    ("sha512", "sha512"),
-    ("png_sha512", "png_sha512"),
-    ("png_sha512", "gif_sha512"),
-    ("gif_sha512", "png_sha512"),
-    ("gif_sha512", "gif_sha512"),
-    ("gzip_sha512", "gzip_sha512"),
-    ("sha512", "gzip_sha512"),
-    ("gzip_sha512", "sha512")]
-
 jinjaenv = jinja2.Environment(loader=jinja2.PackageLoader("dedup", "templates"))
 
 def format_size(size):
@@ -135,11 +125,9 @@ class Application(object):
     def cached_sharedstats(self, pid):
         cur = self.db.cursor()
         sharedstats = {}
-        cur.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ?;",
+        cur.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;",
                     (pid,))
         for pid2, package2, func1, func2, files, size in fetchiter(cur):
-            if (func1, func2) not in hash_functions:
-                continue
             curstats = sharedstats.setdefault(
                 function_combination(func1, func2), list())
             if pid2 == pid:
@@ -218,12 +206,11 @@ class Application(object):
 
     def show_hash(self, function, hashvalue):
         cur = self.db.cursor()
-        cur.execute("SELECT package.name, content.filename, content.size, function.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function ON hash.fid = function.id WHERE hash = ?;",
-                    (hashvalue,))
+        cur.execute("SELECT package.name, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = ? AND hash = ?;",
+                    (function, hashvalue,))
         entries = [dict(package=package, filename=filename, size=size,
                         function=otherfunc)
-                   for package, filename, size, otherfunc in fetchiter(cur)
-                   if (function, otherfunc) in hash_functions]
+                   for package, filename, size, otherfunc in fetchiter(cur)]
         if not entries:
            raise NotFound()
         params = dict(function=function, hashvalue=hashvalue, entries=entries,
--
cgit v1.2.3
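
As a quick illustration of what the eqclass column buys (an editor's
sketch, not part of the patch series): loading schema.sql into Python's
sqlite3 module, a self-join on eqclass enumerates exactly the pairs the
removed hash_functions list used to spell out by hand, identity pairs
included:

    import sqlite3

    db = sqlite3.connect(":memory:")
    with open("schema.sql") as f:
        db.executescript(f.read())
    # sha512 and gzip_sha512 pair up (class 1), png_sha512 and gif_sha512
    # pair up (class 2); no pair crosses the class boundary.
    pairs = db.execute(
        "SELECT f1.name, f2.name FROM function AS f1 "
        "JOIN function AS f2 ON f1.eqclass = f2.eqclass;").fetchall()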
From 069f5412dd1ae1f4695a168ae17aded9fb4461fb Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Fri, 16 Aug 2013 22:36:04 +0200
Subject: webapp templates: add an anchor for file issues

---
 dedup/templates/binary.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dedup/templates/binary.html b/dedup/templates/binary.html
index 69eceef..46c4fa6 100644
--- a/dedup/templates/binary.html
+++ b/dedup/templates/binary.html
@@ -21,7 +21,7 @@
 </table>
 <p>Note: Packages with yellow background are required to be installed when this package is installed.</p>
 {%- endif -%}
 {%- if issues -%}
-<h3>issues with particular files</h3>
+<h3 id="issues">issues with particular files</h3>
 {%- for filename, issue in issues|dictsort(true) -%}
--
cgit v1.2.3

From 1aa2948aaaa2a8e2474918ef57ab84a67d80e804 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Fri, 16 Aug 2013 22:45:18 +0200
Subject: make debian version_compare available in sql

---
 dedup/utils.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/dedup/utils.py b/dedup/utils.py
index 2fae9fd..6fb233b 100644
--- a/dedup/utils.py
+++ b/dedup/utils.py
@@ -1,3 +1,5 @@
+from debian.debian_support import version_compare
+
 def fetchiter(cursor):
     rows = cursor.fetchmany()
     while rows:
@@ -5,3 +7,6 @@ def fetchiter(cursor):
             yield row
         rows = cursor.fetchmany()
 
+def sql_add_version_compare(db):
+    db.create_collation("debian_version", version_compare)
+    db.create_function("debian_version_compare", 2, version_compare)
--
cgit v1.2.3
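
To show how the registered names are meant to be used (an illustrative
sketch, not from the patch; it assumes a database created from
schema.sql): the collation sorts in Debian version order and the SQL
function exposes the three-way comparison directly:

    import sqlite3

    from dedup.utils import sql_add_version_compare

    db = sqlite3.connect("test.sqlite3")
    sql_add_version_compare(db)
    # sort in Debian version order instead of plain byte order
    db.execute("SELECT name, version FROM package "
               "ORDER BY version COLLATE debian_version;")
    # positive result: 1.0-1 is newer than 1.0~rc1-1
    db.execute("SELECT debian_version_compare('1.0-1', '1.0~rc1-1');")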
From 5780f81524f0ae0957380b16db586090a181eaa0 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Mon, 19 Aug 2013 11:52:39 +0200
Subject: importpkg: don't blacklist boring gzip_sha512 hashes

* In practice there are very few compressed files with trivial hashes.
* Blacklisting these values results in false positives in the gzip issues.
---
 importpkg.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/importpkg.py b/importpkg.py
index 182ca01..1334dd6 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -42,7 +42,8 @@ def gziphash():
     hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512())
     hashobj = SuppressingHash(hashobj, (ValueError, zlib.error))
     hashobj.name = "gzip_sha512"
-    return HashBlacklist(hashobj, boring_sha512_hashes)
+    # don't blacklist boring hashes for gzip to get gzip issues right
+    return hashobj
 
 def pnghash():
     hashobj = PNGHash(hashlib.sha512())
--
cgit v1.2.3

From 3134b18dd8e4932b03b87453e6ee4b4a93b5595f Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Mon, 2 Sep 2013 09:30:05 +0200
Subject: importpkg: move library-like parts to dedup.debpkg

---
 dedup/debpkg.py | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 importpkg.py    | 55 +++++++++----------------------------------------------
 2 files changed, 64 insertions(+), 46 deletions(-)
 create mode 100644 dedup/debpkg.py

diff --git a/dedup/debpkg.py b/dedup/debpkg.py
new file mode 100644
index 0000000..d8cc22f
--- /dev/null
+++ b/dedup/debpkg.py
@@ -0,0 +1,55 @@
+from debian import deb822
+
+from dedup.hashing import hash_file
+
+def process_control(control_contents):
+    """Parses the contents of a control file from a control.tar.gz of a Debian
+    package and returns a dictionary containing the fields relevant to dedup.
+    @type control_contents: bytes
+    @rtype: {str: object}
+    """
+    control = deb822.Packages(control_contents)
+    package = control["package"].encode("ascii")
+    try:
+        source = control["source"].encode("ascii").split()[0]
+    except KeyError:
+        source = package
+    version = control["version"].encode("ascii")
+    architecture = control["architecture"].encode("ascii")
+
+    depends = set(dep[0]["name"].encode("ascii")
+                  for dep in control.relations.get("depends", ())
+                  if len(dep) == 1)
+    return dict(package=package, source=source, version=version,
+                architecture=architecture, depends=depends)
+
+class MultiHash(object):
+    def __init__(self, *hashes):
+        self.hashes = hashes
+
+    def update(self, data):
+        for hasher in self.hashes:
+            hasher.update(data)
+
+def get_tar_hashes(tar, hash_functions):
+    """Given a TarFile read all regular files and compute all of the given
+    hash functions on each file.
+    @type tar: tarfile.TarFile
+    @param hash_functions: a sequence of parameter-less functions each creating a
+        new hashlib-like object
+    @rtype: gen((str, int, {str: str}))
+    @returns: an iterable of (filename, filesize, hashes) tuples where
+        hashes is a dict mapping hash function names to hash values
+    """
+
+    for elem in tar:
+        if not elem.isreg(): # excludes hard links as well
+            continue
+        hasher = MultiHash(*[func() for func in hash_functions])
+        hasher = hash_file(hasher, tar.extractfile(elem))
+        hashes = {}
+        for hashobj in hasher.hashes:
+            hashvalue = hashobj.hexdigest()
+            if hashvalue:
+                hashes[hashobj.name] = hashvalue
+        yield (elem.name, elem.size, hashes)
diff --git a/importpkg.py b/importpkg.py
index 1334dd6..54f6181 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -11,24 +11,16 @@ import sys
 import tarfile
 import zlib
 
-from debian import deb822
 import lzma
 import yaml
 
 from dedup.arreader import ArReader
+from dedup.debpkg import process_control, get_tar_hashes
 from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \
-    HashedStream, hash_file
+    HashedStream
 from dedup.compression import GzipDecompressor, DecompressedStream
 from dedup.image import GIFHash, PNGHash
 
-class MultiHash(object):
-    def __init__(self, *hashes):
-        self.hashes = hashes
-
-    def update(self, data):
-        for hasher in self.hashes:
-            hasher.update(data)
-
 boring_sha512_hashes = set((
     # ""
     "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
@@ -57,37 +49,7 @@ def gifhash():
     hashobj.name = "gif_sha512"
     return hashobj
 
-def get_hashes(tar):
-    for elem in tar:
-        if not elem.isreg(): # excludes hard links as well
-            continue
-        hasher = MultiHash(sha512_nontrivial(), gziphash(), pnghash(),
-                           gifhash())
-        hasher = hash_file(hasher, tar.extractfile(elem))
-        hashes = {}
-        for hashobj in hasher.hashes:
-            hashvalue = hashobj.hexdigest()
-            if hashvalue:
-                hashes[hashobj.name] = hashvalue
-        yield (elem.name, elem.size, hashes)
-
-def process_control(control_contents):
-    control = deb822.Packages(control_contents)
-    package = control["package"].encode("ascii")
-    try:
-        source = control["source"].encode("ascii").split()[0]
-    except KeyError:
-        source = package
-    version = control["version"].encode("ascii")
-    architecture = control["architecture"].encode("ascii")
-
-    depends = set(dep[0]["name"].encode("ascii")
-                  for dep in control.relations.get("depends", ())
-                  if len(dep) == 1)
-    return dict(package=package, source=source, version=version,
-                architecture=architecture, depends=depends)
-
-def process_package(filelike):
+def process_package(filelike, hash_functions):
     af = ArReader(filelike)
     af.read_magic()
     state = "start"
@@ -123,7 +85,7 @@ def process_package(filelike):
             continue
         if state != "control_file":
             raise ValueError("missing control file")
-        for name, size, hashes in get_hashes(tf):
+        for name, size, hashes in get_tar_hashes(tf, hash_functions):
             try:
                 name = name.decode("utf8")
             except UnicodeDecodeError:
@@ -133,9 +95,9 @@ def process_package(filelike):
         yield "commit"
         break
 
-def process_package_with_hash(filelike, sha256hash):
+def process_package_with_hash(filelike, hash_functions, sha256hash):
     hstream = HashedStream(filelike, hashlib.sha256())
-    for elem in process_package(hstream):
+    for elem in process_package(hstream, hash_functions):
         if elem == "commit":
             while hstream.read(4096):
                 pass
@@ -150,10 +112,11 @@ def main():
     parser.add_option("-H", "--hash", action="store",
                       help="verify that stdin hash given sha256 hash")
     options, args = parser.parse_args()
+    hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash]
     if options.hash:
-        gen = process_package_with_hash(sys.stdin, options.hash)
+        gen = process_package_with_hash(sys.stdin, hash_functions, options.hash)
     else:
-        gen = process_package(sys.stdin)
+        gen = process_package(sys.stdin, hash_functions)
     yaml.safe_dump_all(gen, sys.stdout)
 
 if __name__ == "__main__":
--
cgit v1.2.3
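
The gain from the move, as an editor's sketch (not from the patch; the
archive name is made up): any TarFile can now be fed through the same
hashing machinery, independent of importpkg's Debian package handling.

    import hashlib
    import tarfile

    from dedup.debpkg import get_tar_hashes

    with tarfile.open("example.tar") as tar:
        # hashlib.sha512 is itself a parameter-less hash factory
        for name, size, hashes in get_tar_hashes(tar, [hashlib.sha512]):
            print("%s %d %r" % (name, size, hashes))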
From c0cd9df1e6cb63d524939028f5f6a07c2c8c3da5 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Mon, 2 Sep 2013 10:00:44 +0200
Subject: autoimport: avoid hard coded temporary directory

---
 autoimport.py | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/autoimport.py b/autoimport.py
index 481a3f8..a0681b3 100755
--- a/autoimport.py
+++ b/autoimport.py
@@ -4,12 +4,14 @@ packages contained. It has rather strong assumptions on the working directory.
 """
 
 import gzip
+import errno
 import io
 import multiprocessing
 import optparse
 import os
 import sqlite3
 import subprocess
+import tempfile
 import urllib
 
 import concurrent.futures
@@ -52,14 +54,14 @@ def process_dir(pkgs, d):
         except ValueError:
             pass
 
-def process_pkg(name, pkgdict):
+def process_pkg(name, pkgdict, outpath):
     filename = pkgdict["filename"]
     print("importing %s" % filename)
     importcmd = ["python", "importpkg.py"]
     if "sha256hash" in pkgdict:
         importcmd.extend(["-H", pkgdict["sha256hash"]])
     if filename.startswith("http://"):
-        with open(os.path.join("tmp", name), "w") as outp:
+        with open(outpath, "w") as outp:
             dl = subprocess.Popen(["curl", "-s", filename],
                                   stdout=subprocess.PIPE, close_fds=True)
             imp = subprocess.Popen(importcmd, stdin=dl.stdout, stdout=outp,
@@ -70,7 +72,7 @@ def process_pkg(name, pkgdict):
             raise ValueError("curl failed")
     else:
         with open(filename) as inp:
-            with open(os.path.join("tmp", name), "w") as outp:
+            with open(outpath, "w") as outp:
                 subprocess.check_call(importcmd, stdin=inp, stdout=outp,
                                       close_fds=True)
     print("preprocessed %s" % name)
@@ -82,7 +84,7 @@ def main():
     parser.add_option("-p", "--prune", action="store_true",
                       help="prune packages old packages")
     options, args = parser.parse_args()
-    subprocess.check_call(["mkdir", "-p", "tmp"])
+    tmpdir = tempfile.mkdtemp(prefix=b"debian-dedup")
     db = sqlite3.connect("test.sqlite3")
     cur = db.cursor()
     cur.execute("PRAGMA foreign_keys = ON;")
@@ -111,14 +113,15 @@ def main():
     with e:
         fs = {}
         for name, pkg in pkgs.items():
-            fs[e.submit(process_pkg, name, pkg)] = name
+            outpath = os.path.join(tmpdir, name)
+            fs[e.submit(process_pkg, name, pkg, outpath)] = name
 
         for f in concurrent.futures.as_completed(fs.keys()):
             name = fs[f]
             if f.exception():
                 print("%s failed to import: %r" % (name, f.exception()))
                 continue
-            inf = os.path.join("tmp", name)
+            inf = os.path.join(tmpdir, name)
             print("sqlimporting %s" % name)
             with open(inf) as inp:
                 try:
@@ -136,6 +139,13 @@ def main():
             # Tables content, dependency and sharing will also be pruned
             # due to ON DELETE CASCADE clauses.
         db.commit()
+    try:
+        os.rmdir(tmpdir)
+    except OSError as err:
+        if err.errno != errno.ENOTEMPTY:
+            raise
+        print("keeping temporary directory %s due to failed packages %s" %
+              (tmpdir, " ".join(os.listdir(tmpdir))))
 
 if __name__ == "__main__":
     main()
--
cgit v1.2.3

From 022985f098a206c3b7852fe08a798cd31623f10d Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Mon, 2 Sep 2013 18:51:20 +0200
Subject: add option -d --database for db path to all scripts

---
 autoimport.py     |  5 ++++-
 readyaml.py       |  8 +++++++-
 update_sharing.py | 11 ++++++++---
 webapp.py         |  8 +++++++-
 4 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/autoimport.py b/autoimport.py
index a0681b3..d44c012 100755
--- a/autoimport.py
+++ b/autoimport.py
@@ -83,9 +83,12 @@ def main():
                       help="avoid reimporting same versions")
     parser.add_option("-p", "--prune", action="store_true",
                       help="prune packages old packages")
+    parser.add_option("-d", "--database", action="store",
+                      default="test.sqlite3",
+                      help="path to the sqlite3 database file")
     options, args = parser.parse_args()
     tmpdir = tempfile.mkdtemp(prefix=b"debian-dedup")
-    db = sqlite3.connect("test.sqlite3")
+    db = sqlite3.connect(options.database)
     cur = db.cursor()
     cur.execute("PRAGMA foreign_keys = ON;")
     e = concurrent.futures.ThreadPoolExecutor(multiprocessing.cpu_count())
diff --git a/readyaml.py b/readyaml.py
index 21b1ca1..2ef9a3b 100755
--- a/readyaml.py
+++ b/readyaml.py
@@ -2,6 +2,7 @@
 """This tool reads a yaml file as generated by importpkg.py on stdin and
 updates the database with the contents."""
 
+import optparse
 import sqlite3
 import sys
 
@@ -53,7 +54,12 @@ def readyaml(db, stream):
     raise ValueError("missing commit block")
 
 def main():
-    db = sqlite3.connect("test.sqlite3")
+    parser = optparse.OptionParser()
+    parser.add_option("-d", "--database", action="store",
+                      default="test.sqlite3",
+                      help="path to the sqlite3 database file")
+    options, args = parser.parse_args()
+    db = sqlite3.connect(options.database)
     readyaml(db, sys.stdin)
 
 if __name__ == "__main__":
diff --git a/update_sharing.py b/update_sharing.py
index 5ec6c7b..1ff0fd8 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -1,5 +1,6 @@
 #!/usr/bin/python
 
+import optparse
 import sqlite3
 
 from dedup.utils import fetchiter
@@ -37,8 +38,7 @@ def process_pkgdict(cursor, pkgdict):
             insert_key = (pid1, pid2, fid1, fid2)
             add_values(cursor, insert_key, pkgnumfiles, pkgsize)
 
-def main():
-    db = sqlite3.connect("test.sqlite3")
+def main(db):
     cur = db.cursor()
     cur.execute("PRAGMA foreign_keys = ON;")
     cur.execute("DELETE FROM sharing;")
@@ -61,4 +61,9 @@ def main():
     db.commit()
 
 if __name__ == "__main__":
-    main()
+    parser = optparse.OptionParser()
+    parser.add_option("-d", "--database", action="store",
+                      default="test.sqlite3",
+                      help="path to the sqlite3 database file")
+    options, args = parser.parse_args()
+    main(sqlite3.connect(options.database))
diff --git a/webapp.py b/webapp.py
index f202c2e..632b485 100755
--- a/webapp.py
+++ b/webapp.py
@@ -1,6 +1,7 @@
 #!/usr/bin/python
 
 import datetime
+import optparse
 import sqlite3
 from wsgiref.simple_server import make_server
 
@@ -237,7 +238,12 @@ class Application(object):
         return html_response(source_template.render(params))
 
 def main():
-    app = Application(sqlite3.connect("test.sqlite3"))
+    parser = optparse.OptionParser()
+    parser.add_option("-d", "--database", action="store",
+                      default="test.sqlite3",
+                      help="path to the sqlite3 database file")
+    options, args = parser.parse_args()
+    app = Application(sqlite3.connect(options.database))
     app = SharedDataMiddleware(app, {"/": ("dedup", "static")})
     make_server("0.0.0.0", 8800, app).serve_forever()
 
--
cgit v1.2.3

From 49cac8bdae0ec787372d227545411ef14905d6a8 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Wed, 4 Sep 2013 10:15:59 +0200
Subject: webapp: serve static files from /static

---
 dedup/templates/base.html | 4 ++--
 webapp.py                 | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/dedup/templates/base.html b/dedup/templates/base.html
index 62f4087..9dfb788 100644
--- a/dedup/templates/base.html
+++ b/dedup/templates/base.html
@@ -3,8 +3,8 @@
     <title>{% block title %}{% endblock %}</title>
-    <link rel="stylesheet" type="text/css" href="{{ urlroot }}/style.css">
-    <script src="{{ urlroot }}/jquery.min.js"></script>
+    <link rel="stylesheet" type="text/css" href="{{ urlroot }}/static/style.css">
+    <script src="{{ urlroot }}/static/jquery.min.js"></script>
     {% block header %}{% endblock %}
diff --git a/webapp.py b/webapp.py
index 632b485..d5f076e 100755
--- a/webapp.py
+++ b/webapp.py
@@ -244,7 +244,7 @@ def main():
                       help="path to the sqlite3 database file")
     options, args = parser.parse_args()
     app = Application(sqlite3.connect(options.database))
-    app = SharedDataMiddleware(app, {"/": ("dedup", "static")})
+    app = SharedDataMiddleware(app, {"/static": ("dedup", "static")})
     make_server("0.0.0.0", 8800, app).serve_forever()
 
 if __name__ == "__main__":
--
cgit v1.2.3
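
How the middleware mapping behaves, as an editor's sketch (not from the
patch; the tiny application below is made up): werkzeug's
SharedDataMiddleware serves requests under the exported prefix from
package data and lets everything else fall through to the wrapped app.

    from werkzeug.wsgi import SharedDataMiddleware

    def app(environ, start_response):
        start_response("200 OK", [("Content-Type", "text/plain")])
        return [b"handled by the application"]

    # GET /static/<file> is answered from the dedup package's static/
    # directory; GET /binary/git still reaches app itself.
    app = SharedDataMiddleware(app, {"/static": ("dedup", "static")})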
From ca65a78a9ace0aeb2565df0da171727c04c33970 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Tue, 10 Sep 2013 09:39:40 +0200
Subject: webapp: close database cursors

Leaking them can result in running out of available filedescriptors.
---
 webapp.py | 98 ++++++++++++++++++++++++++++++-------------------------------
 1 file changed, 50 insertions(+), 48 deletions(-)

diff --git a/webapp.py b/webapp.py
index d5f076e..4478ba0 100755
--- a/webapp.py
+++ b/webapp.py
@@ -1,5 +1,6 @@
 #!/usr/bin/python
 
+import contextlib
 import datetime
 import optparse
 import sqlite3
@@ -98,42 +99,43 @@ class Application(object):
             return e
 
     def get_details(self, package):
-        cur = self.db.cursor()
-        cur.execute("SELECT id, version, architecture FROM package WHERE name = ?;",
-                    (package,))
-        row = cur.fetchone()
-        if not row:
-            raise NotFound()
-        pid, version, architecture = row
-        details = dict(pid=pid,
-                       package=package,
-                       version=version,
-                       architecture=architecture)
-        cur.execute("SELECT count(filename), sum(size) FROM content WHERE pid = ?;",
-                    (pid,))
-        num_files, total_size = cur.fetchone()
+        with contextlib.closing(self.db.cursor()) as cur:
+            cur.execute("SELECT id, version, architecture FROM package WHERE name = ?;",
+                        (package,))
+            row = cur.fetchone()
+            if not row:
+                raise NotFound()
+            pid, version, architecture = row
+            details = dict(pid=pid,
+                           package=package,
+                           version=version,
+                           architecture=architecture)
+            cur.execute("SELECT count(filename), sum(size) FROM content WHERE pid = ?;",
+                        (pid,))
+            num_files, total_size = cur.fetchone()
         if total_size is None:
             total_size = 0
         details.update(dict(num_files=num_files, total_size=total_size))
         return details
 
     def get_dependencies(self, pid):
-        cur = self.db.cursor()
-        cur.execute("SELECT required FROM dependency WHERE pid = ?;",
-                    (pid,))
-        return set(row[0] for row in fetchiter(cur))
+        with contextlib.closing(self.db.cursor()) as cur:
+            cur.execute("SELECT required FROM dependency WHERE pid = ?;",
+                        (pid,))
+            return set(row[0] for row in fetchiter(cur))
 
     def cached_sharedstats(self, pid):
-        cur = self.db.cursor()
         sharedstats = {}
-        cur.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;",
-                    (pid,))
-        for pid2, package2, func1, func2, files, size in fetchiter(cur):
-            curstats = sharedstats.setdefault(
-                function_combination(func1, func2), list())
-            if pid2 == pid:
-                package2 = None
-            curstats.append(dict(package=package2, duplicate=files, savable=size))
+        with contextlib.closing(self.db.cursor()) as cur:
+            cur.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;",
+                        (pid,))
+            for pid2, package2, func1, func2, files, size in fetchiter(cur):
+                curstats = sharedstats.setdefault(
+                    function_combination(func1, func2), list())
+                if pid2 == pid:
+                    package2 = None
+                curstats.append(dict(package=package2, duplicate=files,
+                                     savable=size))
         return sharedstats
 
     def show_package(self, package):
@@ -206,12 +208,12 @@ class Application(object):
         return html_response(detail_template.stream(params))
 
     def show_hash(self, function, hashvalue):
-        cur = self.db.cursor()
-        cur.execute("SELECT package.name, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = ? AND hash = ?;",
-                    (function, hashvalue,))
-        entries = [dict(package=package, filename=filename, size=size,
-                        function=otherfunc)
-                   for package, filename, size, otherfunc in fetchiter(cur)]
+        with contextlib.closing(self.db.cursor()) as cur:
+            cur.execute("SELECT package.name, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = ? AND hash = ?;",
+                        (function, hashvalue,))
+            entries = [dict(package=package, filename=filename, size=size,
+                            function=otherfunc)
+                       for package, filename, size, otherfunc in fetchiter(cur)]
         if not entries:
             raise NotFound()
         params = dict(function=function, hashvalue=hashvalue, entries=entries,
@@ -219,21 +221,21 @@ class Application(object):
         return html_response(hash_template.render(params))
 
     def show_source(self, package):
-        cur = self.db.cursor()
-        cur.execute("SELECT name FROM package WHERE source = ?;",
-                    (package,))
-        binpkgs = dict.fromkeys(pkg for pkg, in fetchiter(cur))
-        if not binpkgs:
-            raise NotFound
-        cur.execute("SELECT p1.name, p2.name, f1.name, f2.name, sharing.files, sharing.size FROM sharing JOIN package AS p1 ON sharing.pid1 = p1.id JOIN package AS p2 ON sharing.pid2 = p2.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE p1.source = ?;",
-                    (package,))
-        for binary, otherbin, func1, func2, files, size in fetchiter(cur):
-            entry = dict(package=otherbin,
-                         funccomb=function_combination(func1, func2),
-                         duplicate=files, savable=size)
-            oldentry = binpkgs.get(binary)
-            if not (oldentry and oldentry["savable"] >= size):
-                binpkgs[binary] = entry
+        with contextlib.closing(self.db.cursor()) as cur:
+            cur.execute("SELECT name FROM package WHERE source = ?;",
+                        (package,))
+            binpkgs = dict.fromkeys(pkg for pkg, in fetchiter(cur))
+            if not binpkgs:
+                raise NotFound
+            cur.execute("SELECT p1.name, p2.name, f1.name, f2.name, sharing.files, sharing.size FROM sharing JOIN package AS p1 ON sharing.pid1 = p1.id JOIN package AS p2 ON sharing.pid2 = p2.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE p1.source = ?;",
+                        (package,))
+            for binary, otherbin, func1, func2, files, size in fetchiter(cur):
+                entry = dict(package=otherbin,
+                             funccomb=function_combination(func1, func2),
+                             duplicate=files, savable=size)
+                oldentry = binpkgs.get(binary)
+                if not (oldentry and oldentry["savable"] >= size):
+                    binpkgs[binary] = entry
         params = dict(source=package, packages=binpkgs, urlroot="..")
         return html_response(source_template.render(params))
--
cgit v1.2.3

From 786c4f93ea318a3c22479f80531594435fb036c3 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Wed, 11 Sep 2013 08:35:41 +0200
Subject: webapp: open cursors less often

On the main instance opening cursors equals initiating a connection.
Unfortunately sqlite3.Connection.close does not close filedescriptors.
So just open fewer cursors to leak filedescriptors less often.
---
 webapp.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/webapp.py b/webapp.py
index 4478ba0..665ac23 100755
--- a/webapp.py
+++ b/webapp.py
@@ -166,6 +166,7 @@ class Application(object):
         cursize = -1
         files = dict()
         minmatch = 2 if pid1 == pid2 else 1
+        cur2 = self.db.cursor()
         for cid, filename, size, hashvalue in fetchiter(cur):
             if cursize != size:
                 for entry in files.values():
@@ -181,13 +182,12 @@ class Application(object):
                 entry = dict(filenames=set((filename,)), size=size,
                              matches={})
                 files[hashvalue] = entry
-            cur2 = self.db.cursor()
             cur2.execute("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = ? AND pid = ?;",
                          (cid, pid2))
             for func1, hashvalue, func2, filename in fetchiter(cur2):
                 entry["matches"].setdefault(filename, {})[func1, func2] = \
                     hashvalue
-            cur2.close()
+        cur2.close()
         cur.close()
 
         for entry in files.values():
--
cgit v1.2.3

From d228c0a4a5827325bca47d63ea287c7cb56537ea Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Thu, 3 Oct 2013 08:51:41 +0200
Subject: work around python-debian's #670679

---
 dedup/debpkg.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dedup/debpkg.py b/dedup/debpkg.py
index d8cc22f..2d67135 100644
--- a/dedup/debpkg.py
+++ b/dedup/debpkg.py
@@ -16,8 +16,8 @@ def process_control(control_contents):
         source = package
     version = control["version"].encode("ascii")
     architecture = control["architecture"].encode("ascii")
-
-    depends = set(dep[0]["name"].encode("ascii")
+    # deb822 currently returns :any dependencies raw. see #670679
+    depends = set(dep[0]["name"].split(u':', 1)[0].encode("ascii")
                   for dep in control.relations.get("depends", ())
                   if len(dep) == 1)
     return dict(package=package, source=source, version=version,
--
cgit v1.2.3

From 17597b5e828f9bbc9b0159102b173c284c23a140 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Wed, 19 Feb 2014 07:54:21 +0100
Subject: DecompressedHash should fail on trailing input

Otherwise all files smaller than 10 bytes are successfully hashed to the
hash of the empty input when using the GzipDecompressor.

Reported-By: Olly Betts
---
 dedup/hashing.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/dedup/hashing.py b/dedup/hashing.py
index 002eda8..5f015b2 100644
--- a/dedup/hashing.py
+++ b/dedup/hashing.py
@@ -49,9 +49,13 @@ class DecompressedHash(object):
 
     def hexdigest(self):
         if not hasattr(self.decompressor, "flush"):
+            if self.decompressor.unused_data:
+                raise ValueError("decompressor did not consume all data")
             return self.hashobj.hexdigest()
         tmpdecomp = self.decompressor.copy()
         data = tmpdecomp.flush()
+        if tmpdecomp.unused_data:
+            raise ValueError("decompressor did not consume all data")
         tmphash = self.hashobj.copy()
         tmphash.update(data)
         return tmphash.hexdigest()
--
cgit v1.2.3

From d467a2a4e85d4b6f09bd2e3dc70466bfcc45a577 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Wed, 19 Feb 2014 14:19:56 +0100
Subject: GzipDecompressor: don't treat checksum as garbage trailer

---
 dedup/compression.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/dedup/compression.py b/dedup/compression.py
index 869c49f..4ce258c 100644
--- a/dedup/compression.py
+++ b/dedup/compression.py
@@ -5,8 +5,11 @@ class GzipDecompressor(object):
     """An interface to gzip which is similar to bz2.BZ2Decompressor and
     lzma.LZMADecompressor."""
     def __init__(self):
+        self.sawheader = False
         self.inbuffer = b""
         self.decompressor = None
+        self.crc = 0
+        self.size = 0
 
     def decompress(self, data):
         """
@@ -16,6 +19,8 @@ class GzipDecompressor(object):
         while True:
             if self.decompressor:
                 data = self.decompressor.decompress(data)
+                self.crc = zlib.crc32(data, self.crc)
+                self.size += len(data)
                 unused_data = self.decompressor.unused_data
                 if not unused_data:
                     return data
@@ -45,13 +50,20 @@ class GzipDecompressor(object):
                 return b""
             data = self.inbuffer[skip:]
             self.inbuffer = b""
+            self.sawheader = True
             self.decompressor = zlib.decompressobj(-zlib.MAX_WBITS)
 
     @property
     def unused_data(self):
         if self.decompressor:
             return self.decompressor.unused_data
+        elif not self.sawheader:
+            return self.inbuffer
         else:
+            expect = struct.pack("
--
cgit v1.2.3
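
The effect of the two preceding fixes together, as an editor's sketch
(not from the patches; SuppressingHash in importpkg.py later converts the
ValueError into a skipped hash):

    import hashlib

    from dedup.compression import GzipDecompressor
    from dedup.hashing import DecompressedHash

    hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512())
    hashobj.update(b"hi")  # two bytes, shorter than any gzip header
    hashobj.hexdigest()    # raises ValueError now; previously this returned
                           # the sha512 of b"", a bogus duplicate of empty files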
From: Helmut Grohne
Date: Wed, 19 Feb 2014 14:21:20 +0100
Subject: blacklist content rather than hashes

Otherwise the gzip hash cannot tell the empty stream and the compressed
empty stream apart.
---
 dedup/hashing.py | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 importpkg.py     | 15 +++++----------
 2 files changed, 55 insertions(+), 10 deletions(-)

diff --git a/dedup/hashing.py b/dedup/hashing.py
index 5f015b2..70f6268 100644
--- a/dedup/hashing.py
+++ b/dedup/hashing.py
@@ -1,3 +1,5 @@
+import itertools
+
 class HashBlacklist(object):
     """Turn a hashlib-like object into a hash that returns None for some
     blacklisted hashes instead of the real hash value.
@@ -29,6 +31,54 @@ class HashBlacklist(object):
     def copy(self):
         return HashBlacklist(self.hashobj.copy(), self.blacklist)
 
+class HashBlacklistContent(object):
+    """Turn a hashlib-like object into a hash that returns None for some
+    blacklisted content instead of the real hash value. Unlike HashBlacklist,
+    not the output of the hash is considered, but its input."""
+
+    def __init__(self, hashobj, blacklist=(), maxlen=None):
+        """
+        @param hashobj: a hashlib-like object
+        @param blacklist: an object providing __contains__.
+            hash inputs which are contained in the blacklist
+            are turned into None values
+        @param maxlen: the maximum length of a blacklisted input.
+            Defaults to max(map(len, blacklist)), so if it is absent,
+            the blacklist must support iteration.
+        """
+        self.hashobj = hashobj
+        self.blacklist = blacklist
+        if maxlen is None:
+            # the chain avoids passing the empty sequence to max
+            maxlen = max(itertools.chain((0,), itertools.imap(len, blacklist)))
+        self.maxlen = maxlen
+        self.stored = ""
+
+    @property
+    def name(self):
+        return self.hashobj.name
+
+    def update(self, data):
+        if self.stored is not None:
+            self.stored += data
+            if len(self.stored) > self.maxlen:
+                self.stored = None
+        self.hashobj.update(data)
+
+    def digest(self):
+        if self.stored is not None and self.stored in self.blacklist:
+            return None
+        return self.hashobj.digest()
+
+    def hexdigest(self):
+        if self.stored is not None and self.stored in self.blacklist:
+            return None
+        return self.hashobj.hexdigest()
+
+    def copy(self):
+        return HashBlacklistContent(self.hashobj.copy(), self.blacklist,
+                                    self.maxlen)
+
 class DecompressedHash(object):
     """Apply a decompression function before the hash. This class provides
     the hashlib interface (update, hexdigest, copy) excluding digest and
     name."""
diff --git a/importpkg.py b/importpkg.py
index 54f6181..cb16f97 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -16,26 +16,21 @@ import yaml
 
 from dedup.arreader import ArReader
 from dedup.debpkg import process_control, get_tar_hashes
-from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \
-    HashedStream
+from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
+    HashBlacklistContent
 from dedup.compression import GzipDecompressor, DecompressedStream
 from dedup.image import GIFHash, PNGHash
 
-boring_sha512_hashes = set((
-    # ""
-    "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
-    # "\n"
-    "be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09"))
+boring_content = set(("", "\n"))
 
 def sha512_nontrivial():
-    return HashBlacklist(hashlib.sha512(), boring_sha512_hashes)
+    return HashBlacklistContent(hashlib.sha512(), boring_content)
 
 def gziphash():
     hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512())
     hashobj = SuppressingHash(hashobj, (ValueError, zlib.error))
     hashobj.name = "gzip_sha512"
-    # don't blacklist boring hashes for gzip to get gzip issues right
-    return hashobj
+    return HashBlacklistContent(hashobj, boring_content)
 
 def pnghash():
     hashobj = PNGHash(hashlib.sha512())
--
cgit v1.2.3
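
Minimal use of the new wrapper, as an editor's sketch (Python 2, matching
the itertools.imap above):

    import hashlib

    from dedup.hashing import HashBlacklistContent

    h = HashBlacklistContent(hashlib.sha512(), set(("", "\n")))
    h.update("\n")
    print(h.hexdigest())  # None: the input is blacklisted, not its digest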
From 7389e4b00f6add611e8d6b318654056097d6d546 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Fri, 21 Feb 2014 21:59:04 +0100
Subject: update_sharing: weaken assumptions about db layout

Hash functions are partitioned into equivalence classes. We are generally
only interested in sharing among hash functions with the same equivalence
class, but the algorithm would compute any sharing. While the current
layout never produces the same hashes for functions in different
equivalence classes (for different output length), that may change in
future. Also allow hash functions that belong to no equivalence class at
all (eqclass = NULL) as a means to add additional metadata to content
without computing any sharing for it.
---
 update_sharing.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/update_sharing.py b/update_sharing.py
index 1ff0fd8..ca6890b 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -47,14 +47,20 @@ def main(db):
     readcur = db.cursor()
     readcur.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
     for hashvalue, in fetchiter(readcur):
-        cur.execute("SELECT content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;",
+        cur.execute("SELECT function.eqclass, content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id JOIN function ON hash.fid = function.id AND function.eqclass IS NOT NULL WHERE hash = ?;",
                     (hashvalue,))
-        rows = cur.fetchall()
-        print("processing hash %s with %d entries" % (hashvalue, len(rows)))
-        pkgdict = compute_pkgdict(rows)
-        cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);",
-                        [(row[1],) for row in rows])
-        process_pkgdict(cur, pkgdict)
+        rowdict = dict()
+        for row in cur.fetchall():
+            rowdict.setdefault(row[0], []).append(row[1:])
+        for eqclass, rows in rowdict.items():
+            if len(rows) < 2:
+                print("skipping hash %s class %d with too few entries" % (hashvalue, eqclass))
+                continue
+            print("processing hash %s class %d with %d entries" % (hashvalue, eqclass, len(rows)))
+            pkgdict = compute_pkgdict(rows)
+            cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);",
+                            [(row[1],) for row in rows])
+            process_pkgdict(cur, pkgdict)
     cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'file named something.gz is not a valid gzip file' FROM content WHERE content.filename LIKE '%.gz' AND NOT EXISTS (SELECT 1 FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = 'gzip_sha512');")
     cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'png_sha512' AND lower(filename) NOT LIKE '%.png';")
     cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'gif image not named something.gif' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'gif_sha512' AND lower(filename) NOT LIKE '%.gif';")
--
cgit v1.2.3
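
What the relaxed layout permits, as an editor's sketch (the mimetype
function name is hypothetical, and db is assumed to be an open sqlite3
connection to the dedup database): a row with eqclass NULL is skipped by
the query's join condition, so its hashes annotate content without ever
producing sharing entries.

    # a hash function that only attaches metadata, never shares anything
    db.execute("INSERT INTO function (name, eqclass) "
               "VALUES ('mimetype', NULL);")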
From b38f14ab3fb72ca1578d7e6bb09178e6fbebba76 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Sun, 23 Feb 2014 15:44:03 +0100
Subject: webapp: fix eqclass usage in package comparison

When comparing two packages, objects would be considered duplicates
without considering whether the respective hash functions are comparable
by checking their equivalence classes. The current set of hash functions
does not expose this bug.
---
 webapp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/webapp.py b/webapp.py
index 665ac23..fd6d685 100755
--- a/webapp.py
+++ b/webapp.py
@@ -182,7 +182,7 @@ class Application(object):
                 entry = dict(filenames=set((filename,)), size=size,
                              matches={})
                 files[hashvalue] = entry
-            cur2.execute("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = ? AND pid = ?;",
+            cur2.execute("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = ? AND pid = ? AND fa.eqclass = fb.eqclass;",
                          (cid, pid2))
             for func1, hashvalue, func2, filename in fetchiter(cur2):
                 entry["matches"].setdefault(filename, {})[func1, func2] = \
--
cgit v1.2.3
From 8ccd5205f77276b333c56efb8271a0ddf11590a0 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Sun, 23 Feb 2014 17:29:41 +0100
Subject: fix spelling mistake

Reported-By: Stefan Kaltenbrunner
---
 dedup/templates/index.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dedup/templates/index.html b/dedup/templates/index.html
index 7c9000f..169027e 100644
--- a/dedup/templates/index.html
+++ b/dedup/templates/index.html
@@ -28,7 +28,7 @@ {% block content %}
 <h1>Debian duplication detector</h1>
 <ul>
-<li>To inspect a particlar binary package, go to
-    binary/&lt;packagename&gt;
-    Example: binary/git
+<li>To inspect a particular binary package, go to
+    binary/&lt;packagename&gt;
+    Example: binary/git
--
cgit v1.2.3