author     Helmut Grohne <helmut@subdivi.de>   2014-03-08 12:39:32 +0100
committer  Helmut Grohne <helmut@subdivi.de>   2014-03-08 12:39:32 +0100
commit     c6a30cefff55cd247a47fa0a2d4f819592e1202b (patch)
tree       58b6ff52bc6827782c2973f1ce976e245ce5f34c
parent     751f19ec1107c9059ae4834e4b757741ebee6cbd (diff)
parent     bb0aea9971bc79d8787d8f034022d0ca803fcab3 (diff)
download   debian-dedup-c6a30cefff55cd247a47fa0a2d4f819592e1202b.tar.gz
Merge branch 'master' into sqlalchemy
In the meantime, the master branch evolved quite a bit and the schema
changed again (eqclass added to the function table). The main reason for
the merge is to resolve the large amount of conflicts once, so that
development of the sqlalchemy branch can continue and still benefit from
changes in the master branch, such as schema compatibility and the
adapted indentation level in the web app due to the use of
contextlib.closing, which resembles sqlalchemy's "with db.begin() as
conn:".
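To illustrate the parallel in indentation shape mentioned above — this is
a minimal sketch for orientation, not code from this repository, and the
database path is only an example:

    import contextlib
    import sqlite3

    import sqlalchemy

    # stdlib flavour: contextlib.closing turns any object with a .close()
    # method into a context manager, adding one indentation level around
    # the queries.
    with contextlib.closing(sqlite3.connect("test.sqlite3")) as conn:
        print(conn.execute("SELECT 1;").fetchone())

    # sqlalchemy flavour: engine.begin() yields a transacted connection
    # and commits (or rolls back) on exit -- the code takes the same shape.
    db = sqlalchemy.create_engine("sqlite:///test.sqlite3")
    with db.begin() as conn:
        print(conn.execute(sqlalchemy.text("SELECT 1;")).fetchone())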
Conflicts:
autoimport.py
dedup/utils.py
readyaml.py
update_sharing.py
webapp.py
-rwxr-xr-x | autoimport.py               | 25
-rw-r--r-- | dedup/compression.py        | 15
-rw-r--r-- | dedup/debpkg.py             | 55
-rw-r--r-- | dedup/hashing.py            | 56
-rw-r--r-- | dedup/image.py              |  2
-rw-r--r-- | dedup/templates/base.html   |  4
-rw-r--r-- | dedup/templates/binary.html |  2
-rw-r--r-- | dedup/templates/index.html  |  2
-rw-r--r-- | dedup/utils.py              |  6
-rwxr-xr-x | importpkg.py                | 71
-rwxr-xr-x | readyaml.py                 |  8
-rw-r--r-- | schema.sql                  |  4
-rwxr-xr-x | update_sharing.py           | 42
-rwxr-xr-x | webapp.py                   | 37
14 files changed, 222 insertions, 107 deletions
diff --git a/autoimport.py b/autoimport.py
index c1e0da5..8c61a18 100755
--- a/autoimport.py
+++ b/autoimport.py
@@ -4,11 +4,13 @@ packages contained. It has rather strong assumptions on the working
 directory.
 """
 import gzip
+import errno
 import io
 import multiprocessing
 import optparse
 import os
 import subprocess
+import tempfile
 import urllib
 
 import concurrent.futures
@@ -54,14 +56,14 @@ def process_dir(pkgs, d):
         except ValueError:
             pass
 
-def process_pkg(name, pkgdict):
+def process_pkg(name, pkgdict, outpath):
     filename = pkgdict["filename"]
     print("importing %s" % filename)
     importcmd = ["python", "importpkg.py"]
     if "sha256hash" in pkgdict:
         importcmd.extend(["-H", pkgdict["sha256hash"]])
     if filename.startswith("http://"):
-        with open(os.path.join("tmp", name), "w") as outp:
+        with open(outpath, "w") as outp:
             dl = subprocess.Popen(["curl", "-s", filename],
                                   stdout=subprocess.PIPE, close_fds=True)
             imp = subprocess.Popen(importcmd, stdin=dl.stdout, stdout=outp,
@@ -72,7 +74,7 @@ def process_pkg(name, pkgdict):
             raise ValueError("curl failed")
     else:
         with open(filename) as inp:
-            with open(os.path.join("tmp", name), "w") as outp:
+            with open(outpath, "w") as outp:
                 subprocess.check_call(importcmd, stdin=inp, stdout=outp,
                                       close_fds=True)
     print("preprocessed %s" % name)
@@ -83,8 +85,11 @@ def main():
                       help="avoid reimporting same versions")
     parser.add_option("-p", "--prune", action="store_true",
                       help="prune packages old packages")
+    parser.add_option("-d", "--database", action="store",
+                      default="sqlite:///test.sqlite3",
+                      help="location of the database")
     options, args = parser.parse_args()
-    subprocess.check_call(["mkdir", "-p", "tmp"])
+    tmpdir = tempfile.mkdtemp(prefix=b"debian-dedup")
     db = sqlalchemy.create_engine("sqlite:///test.sqlite3")
     enable_sqlite_foreign_keys(db)
     e = concurrent.futures.ThreadPoolExecutor(multiprocessing.cpu_count())
@@ -113,14 +118,15 @@ def main():
     with e:
         fs = {}
         for name, pkg in pkgs.items():
-            fs[e.submit(process_pkg, name, pkg)] = name
+            outpath = os.path.join(tmpdir, name)
+            fs[e.submit(process_pkg, name, pkg, outpath)] = name
 
         for f in concurrent.futures.as_completed(fs.keys()):
             name = fs[f]
             if f.exception():
                 print("%s failed to import: %r" % (name, f.exception()))
                 continue
-            inf = os.path.join("tmp", name)
+            inf = os.path.join(tmpdir, name)
             print("sqlimporting %s" % name)
             with open(inf) as inp:
                 try:
@@ -140,6 +146,13 @@ def main():
                          [dict(name=pkg) for pkg in delpkgs])
             # Tables content, dependency and sharing will also be pruned
             # due to ON DELETE CASCADE clauses.
+    try:
+        os.rmdir(tmpdir)
+    except OSError as err:
+        if err.errno != errno.ENOTEMPTY:
+            raise
+        print("keeping temporary directory %s due to failed packages %s" %
+              (tmpdir, " ".join(os.listdir(tmpdir))))
 
 if __name__ == "__main__":
     main()
diff --git a/dedup/compression.py b/dedup/compression.py
index 869c49f..4ce258c 100644
--- a/dedup/compression.py
+++ b/dedup/compression.py
@@ -5,8 +5,11 @@ class GzipDecompressor(object):
     """An interface to gzip which is similar to bz2.BZ2Decompressor and
     lzma.LZMADecompressor."""
     def __init__(self):
+        self.sawheader = False
         self.inbuffer = b""
         self.decompressor = None
+        self.crc = 0
+        self.size = 0
 
     def decompress(self, data):
         """
@@ -16,6 +19,8 @@ class GzipDecompressor(object):
         while True:
             if self.decompressor:
                 data = self.decompressor.decompress(data)
+                self.crc = zlib.crc32(data, self.crc)
+                self.size += len(data)
                 unused_data = self.decompressor.unused_data
                 if not unused_data:
                     return data
@@ -45,13 +50,20 @@ class GzipDecompressor(object):
                 return b""
             data = self.inbuffer[skip:]
             self.inbuffer = b""
+            self.sawheader = True
             self.decompressor = zlib.decompressobj(-zlib.MAX_WBITS)
 
     @property
     def unused_data(self):
         if self.decompressor:
             return self.decompressor.unused_data
+        elif not self.sawheader:
+            return self.inbuffer
         else:
+            expect = struct.pack("<ll", self.crc, self.size)
+            if self.inbuffer.startswith(expect) and \
+                    self.inbuffer[len(expect):].replace("\0", "") == "":
+                return b""
             return self.inbuffer
 
     def flush(self):
@@ -67,6 +79,9 @@ class GzipDecompressor(object):
         new.inbuffer = self.inbuffer
         if self.decompressor:
             new.decompressor = self.decompressor.copy()
+        new.sawheader = self.sawheader
+        new.crc = self.crc
+        new.size = self.size
         return new
 
 class DecompressedStream(object):
diff --git a/dedup/debpkg.py b/dedup/debpkg.py
new file mode 100644
index 0000000..2d67135
--- /dev/null
+++ b/dedup/debpkg.py
@@ -0,0 +1,55 @@
+from debian import deb822
+
+from dedup.hashing import hash_file
+
+def process_control(control_contents):
+    """Parses the contents of a control file from a control.tar.gz of a Debian
+    package and returns a dictionary containing the fields relevant to dedup.
+    @type control_contents: bytes
+    @rtype: {str: object}
+    """
+    control = deb822.Packages(control_contents)
+    package = control["package"].encode("ascii")
+    try:
+        source = control["source"].encode("ascii").split()[0]
+    except KeyError:
+        source = package
+    version = control["version"].encode("ascii")
+    architecture = control["architecture"].encode("ascii")
+    # deb822 currently returns :any dependencies raw. see #670679
+    depends = set(dep[0]["name"].split(u':', 1)[0].encode("ascii")
+                  for dep in control.relations.get("depends", ())
+                  if len(dep) == 1)
+    return dict(package=package, source=source, version=version,
+                architecture=architecture, depends=depends)
+
+class MultiHash(object):
+    def __init__(self, *hashes):
+        self.hashes = hashes
+
+    def update(self, data):
+        for hasher in self.hashes:
+            hasher.update(data)
+
+def get_tar_hashes(tar, hash_functions):
+    """Given a TarFile read all regular files and compute all of the given
+    hash functions on each file.
+    @type tar: tarfile.TarFile
+    @param hash_functions: a sequence of parameter-less functions each
+            creating a new hashlib-like object
+    @rtype: gen((str, int, {str: str}}
+    @returns: an iterable of (filename, filesize, hashes) tuples where
+            hashes is a dict mapping hash function names to hash values
+    """
+
+    for elem in tar:
+        if not elem.isreg(): # excludes hard links as well
+            continue
+        hasher = MultiHash(*[func() for func in hash_functions])
+        hasher = hash_file(hasher, tar.extractfile(elem))
+        hashes = {}
+        for hashobj in hasher.hashes:
+            hashvalue = hashobj.hexdigest()
+            if hashvalue:
+                hashes[hashobj.name] = hashvalue
+        yield (elem.name, elem.size, hashes)
diff --git a/dedup/hashing.py b/dedup/hashing.py
index 002eda8..a8a46c7 100644
--- a/dedup/hashing.py
+++ b/dedup/hashing.py
@@ -1,3 +1,5 @@
+import itertools
+
 class HashBlacklist(object):
     """Turn a hashlib-like object into a hash that returns None for some
     blacklisted hashes instead of the real hash value.
@@ -29,6 +31,54 @@ class HashBlacklist(object):
     def copy(self):
         return HashBlacklist(self.hashobj.copy(), self.blacklist)
 
+class HashBlacklistContent(object):
+    """Turn a hashlib-like object into a hash that returns None for some
+    blacklisted content instead of the real hash value. Unlike HashBlacklist,
+    not the output of the hash is considered, but its input."""
+
+    def __init__(self, hashobj, blacklist=(), maxlen=None):
+        """
+        @param hashobj: a hashlib-like object
+        @param blacklist: an object providing __contains__.
+                hash inputs which are contained in the blacklist
+                are turned into None values
+        @param maxlen: the maximum length of a blacklisted input.
+                Defaults to max(map(len, blacklist)), so if it is absent,
+                the blacklist must support iteration.
+        """
+        self.hashobj = hashobj
+        self.blacklist = blacklist
+        if maxlen is None:
+            # the chain avoids passing the empty sequence to max
+            maxlen = max(itertools.chain((0,), itertools.imap(len, blacklist)))
+        self.maxlen = maxlen
+        self.stored = ""
+
+    @property
+    def name(self):
+        return self.hashobj.name
+
+    def update(self, data):
+        if self.stored is not None:
+            self.stored += data
+            if len(self.stored) > self.maxlen:
+                self.stored = None
+        self.hashobj.update(data)
+
+    def digest(self):
+        if self.stored is not None and self.stored in self.blacklist:
+            return None
+        return self.hashobj.digest()
+
+    def hexdigest(self):
+        if self.stored is not None and self.stored in self.blacklist:
+            return None
+        return self.hashobj.hexdigest()
+
+    def copy(self):
+        return HashBlacklistContent(self.hashobj.copy(), self.blacklist,
+                                    self.maxlen)
+
 class DecompressedHash(object):
     """Apply a decompression function before the hash. This class provides
     the hashlib interface (update, hexdigest, copy) excluding digest and name."""
@@ -49,9 +99,13 @@ class DecompressedHash(object):
 
     def hexdigest(self):
         if not hasattr(self.decompressor, "flush"):
+            if self.decompressor.unused_data:
+                raise ValueError("decompressor did not consume all data")
             return self.hashobj.hexdigest()
         tmpdecomp = self.decompressor.copy()
         data = tmpdecomp.flush()
+        if tmpdecomp.unused_data:
+            raise ValueError("decompressor did not consume all data")
         tmphash = self.hashobj.copy()
         tmphash.update(data)
         return tmphash.hexdigest()
@@ -61,7 +115,7 @@ class DecompressedHash(object):
 
 class SuppressingHash(object):
     """A hash that silences exceptions from the update and hexdigest methods of
-    a hashlib-like object. If an exception has occured, hexdigest always
+    a hashlib-like object. If an exception has occurred, hexdigest always
     returns None."""
     def __init__(self, hashobj, exceptions=()):
         """
diff --git a/dedup/image.py b/dedup/image.py
index c1f2de0..ef17989 100644
--- a/dedup/image.py
+++ b/dedup/image.py
@@ -4,7 +4,7 @@ import struct
 import PIL.Image
 
 class ImageHash(object):
-    """A hash on the contents of an image datat type supported by PIL. This
+    """A hash on the contents of an image data type supported by PIL. This
     disregards mode, depth and meta information. Note that due to limitations
     in PIL and the image format (interlacing) the full contents are stored and
     decoded in hexdigest."""
diff --git a/dedup/templates/base.html b/dedup/templates/base.html
index 62f4087..9dfb788 100644
--- a/dedup/templates/base.html
+++ b/dedup/templates/base.html
@@ -3,8 +3,8 @@
   <head>
     <title>{% block title %}{% endblock %}</title>
     <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
-    <link rel="stylesheet" type="text/css" href="{{ urlroot|e }}/style.css">
-    <link rel="icon" type="image/vnd.microsoft.icon" href="{{ urlroot|e }}/favicon.ico">
+    <link rel="stylesheet" type="text/css" href="{{ urlroot|e }}/static/style.css">
+    <link rel="icon" type="image/vnd.microsoft.icon" href="{{ urlroot|e }}/static/favicon.ico">
     {% block header %}{% endblock %}
   </head>
   <body>
diff --git a/dedup/templates/binary.html b/dedup/templates/binary.html
index 69eceef..46c4fa6 100644
--- a/dedup/templates/binary.html
+++ b/dedup/templates/binary.html
@@ -21,7 +21,7 @@
     <p>Note: Packages with yellow background are required to be installed when this package is installed.</p>
     {%- endif -%}
     {%- if issues -%}
-        <h3>issues with particular files</h3>
+        <h3 id="issues">issues with particular files</h3>
         <table border='1'><tr><th>filename</th><th>issue</th></tr>
         {%- for filename, issue in issues|dictsort(true) -%}
             <tr><td><span class="filename">{{ filename|e }}</span></td><td>{{ issue|e }}</td></tr>
diff --git a/dedup/templates/index.html b/dedup/templates/index.html
index 7c9000f..169027e 100644
--- a/dedup/templates/index.html
+++ b/dedup/templates/index.html
@@ -28,7 +28,7 @@
 {% block content %}
 <h1>Debian duplication detector</h1>
 <ul>
-<li>To inspect a particlar binary package, go to <pre>binary/<packagename></pre> Example: <a href="binary/git">binary/git</a>
+<li>To inspect a particular binary package, go to <pre>binary/<packagename></pre> Example: <a href="binary/git">binary/git</a>
 <div style="display:none" id="form_div"><fieldset>
 <legend>Inspect package</legend>
 <noscript><b>This form is dysfunctional when javascript is not enabled</b></noscript>
diff --git a/dedup/utils.py b/dedup/utils.py
index 6864ad3..fd30378 100644
--- a/dedup/utils.py
+++ b/dedup/utils.py
@@ -1,3 +1,4 @@
+from debian.debian_support import version_compare
 import sqlalchemy.event
 
 def fetchiter(cursor):
@@ -12,3 +13,8 @@ def enable_sqlite_foreign_keys(engine):
     def pragma_foreign_keys(connection, _):
         connection.execute("PRAGMA foreign_keys=ON;")
 
+def sqlite_add_version_compare(engine):
+    @sqlalchemy.event.listens_for(engine, "connect")
+    def add_version_compare(connection, _):
+        connection.create_collation("debian_version", version_compare)
+        connection.create_function("debian_version_compare", 2, version_compare)
diff --git a/importpkg.py b/importpkg.py
index 182ca01..aeccda5 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -1,7 +1,7 @@
 #!/usr/bin/python
-"""This tool reads a debian package from stdin and emits a yaml stream on
+"""This tool reads a Debian package from stdin and emits a yaml stream on
 stdout. It does not access a database. Therefore it can be run in parallel and
-on multiple machines. The generated yaml conatins multiple documents. The first
+on multiple machines. The generated yaml contains multiple documents. The first
 document contains package metadata. Then a document is emitted for each file.
 And finally a document consisting of the string "commit" is emitted."""
 
@@ -11,38 +11,26 @@ import sys
 import tarfile
 import zlib
 
-from debian import deb822
 import lzma
 import yaml
 
 from dedup.arreader import ArReader
-from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \
-    HashedStream, hash_file
+from dedup.debpkg import process_control, get_tar_hashes
+from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
+    HashBlacklistContent
 from dedup.compression import GzipDecompressor, DecompressedStream
 from dedup.image import GIFHash, PNGHash
 
-class MultiHash(object):
-    def __init__(self, *hashes):
-        self.hashes = hashes
-
-    def update(self, data):
-        for hasher in self.hashes:
-            hasher.update(data)
-
-boring_sha512_hashes = set((
-    # ""
-    "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
-    # "\n"
-    "be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09"))
+boring_content = set(("", "\n"))
 
 def sha512_nontrivial():
-    return HashBlacklist(hashlib.sha512(), boring_sha512_hashes)
+    return HashBlacklistContent(hashlib.sha512(), boring_content)
 
 def gziphash():
     hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512())
     hashobj = SuppressingHash(hashobj, (ValueError, zlib.error))
     hashobj.name = "gzip_sha512"
-    return HashBlacklist(hashobj, boring_sha512_hashes)
+    return HashBlacklistContent(hashobj, boring_content)
 
 def pnghash():
     hashobj = PNGHash(hashlib.sha512())
@@ -56,37 +44,7 @@ def gifhash():
     hashobj.name = "gif_sha512"
     return hashobj
 
-def get_hashes(tar):
-    for elem in tar:
-        if not elem.isreg(): # excludes hard links as well
-            continue
-        hasher = MultiHash(sha512_nontrivial(), gziphash(), pnghash(),
-                           gifhash())
-        hasher = hash_file(hasher, tar.extractfile(elem))
-        hashes = {}
-        for hashobj in hasher.hashes:
-            hashvalue = hashobj.hexdigest()
-            if hashvalue:
-                hashes[hashobj.name] = hashvalue
-        yield (elem.name, elem.size, hashes)
-
-def process_control(control_contents):
-    control = deb822.Packages(control_contents)
-    package = control["package"].encode("ascii")
-    try:
-        source = control["source"].encode("ascii").split()[0]
-    except KeyError:
-        source = package
-    version = control["version"].encode("ascii")
-    architecture = control["architecture"].encode("ascii")
-
-    depends = set(dep[0]["name"].encode("ascii")
-                  for dep in control.relations.get("depends", ())
-                  if len(dep) == 1)
-    return dict(package=package, source=source, version=version,
-                architecture=architecture, depends=depends)
-
-def process_package(filelike):
+def process_package(filelike, hash_functions):
     af = ArReader(filelike)
     af.read_magic()
     state = "start"
@@ -122,7 +80,7 @@ def process_package(filelike):
             continue
         if state != "control_file":
             raise ValueError("missing control file")
-        for name, size, hashes in get_hashes(tf):
+        for name, size, hashes in get_tar_hashes(tf, hash_functions):
             try:
                 name = name.decode("utf8")
             except UnicodeDecodeError:
@@ -132,9 +90,9 @@
         yield "commit"
         break
 
-def process_package_with_hash(filelike, sha256hash):
+def process_package_with_hash(filelike, hash_functions, sha256hash):
     hstream = HashedStream(filelike, hashlib.sha256())
-    for elem in process_package(hstream):
+    for elem in process_package(hstream, hash_functions):
         if elem == "commit":
             while hstream.read(4096):
                 pass
@@ -149,10 +107,11 @@ def main():
     parser.add_option("-H", "--hash", action="store",
                       help="verify that stdin hash given sha256 hash")
     options, args = parser.parse_args()
+    hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash]
     if options.hash:
-        gen = process_package_with_hash(sys.stdin, options.hash)
+        gen = process_package_with_hash(sys.stdin, hash_functions, options.hash)
     else:
-        gen = process_package(sys.stdin)
+        gen = process_package(sys.stdin, hash_functions)
     yaml.safe_dump_all(gen, sys.stdout)
 
 if __name__ == "__main__":
diff --git a/readyaml.py b/readyaml.py
index 835967d..15cfcb3 100755
--- a/readyaml.py
+++ b/readyaml.py
@@ -2,6 +2,7 @@
 """This tool reads a yaml file as generated by importpkg.py on stdin and
 updates the database with the contents."""
 
+import optparse
 import sys
 
 from debian.debian_support import version_compare
@@ -57,7 +58,12 @@ def readyaml(conn, stream):
     raise ValueError("missing commit block")
 
 def main():
-    db = sqlalchemy.create_engine("sqlite:///test.sqlite3")
+    parser = optparse.OptionParser()
+    parser.add_option("-d", "--database", action="store",
+                      default="sqlite:///test.sqlite3",
+                      help="location of the database")
+    options, args = parser.parse_args()
+    db = sqlalchemy.create_engine(options.database)
     enable_sqlite_foreign_keys(db)
     with db.begin() as conn:
         readyaml(conn, sys.stdin)
diff --git a/schema.sql b/schema.sql
--- a/schema.sql
+++ b/schema.sql
@@ -1,7 +1,7 @@
 CREATE TABLE package (id INTEGER PRIMARY KEY, name TEXT UNIQUE, version TEXT, architecture TEXT, source TEXT);
 CREATE TABLE content (id INTEGER PRIMARY KEY, pid INTEGER, filename TEXT, size INTEGER, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE);
-CREATE TABLE function (id INTEGER PRIMARY KEY, name TEXT UNIQUE NOT NULL);
-INSERT INTO function (name) VALUES ("sha512"), ("gzip_sha512"), ("png_sha512"), ("gif_sha512");
+CREATE TABLE function (id INTEGER PRIMARY KEY, name TEXT UNIQUE NOT NULL, eqclass INTEGER);
+INSERT INTO function (id, name, eqclass) VALUES (1, 'sha512', 1), (2, 'gzip_sha512', 1), (3, 'png_sha512', 2), (4, 'gif_sha512', 2);
 CREATE TABLE hash (cid INTEGER, fid INTEGER NOT NULL, hash TEXT, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE, FOREIGN KEY (fid) REFERENCES function(id));
 CREATE TABLE dependency (pid INTEGER, required TEXT, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE);
 CREATE INDEX content_package_size_index ON content (pid, size);
diff --git a/update_sharing.py b/update_sharing.py
index 6fd83f8..450bfc7 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -1,5 +1,7 @@
 #!/usr/bin/python
 
+import optparse
+
 import sqlalchemy
 
 from dedup.utils import fetchiter, enable_sqlite_foreign_keys
@@ -39,30 +41,42 @@ def process_pkgdict(conn, pkgdict):
                 insert_key = (pid1, pid2, fid1, fid2)
                 add_values(conn, insert_key, pkgnumfiles, pkgsize)
 
-def main():
-    db = sqlalchemy.create_engine("sqlite:///test.sqlite3")
-    enable_sqlite_foreign_keys(db)
+def main(db):
     with db.begin() as conn:
         conn.execute("DELETE FROM sharing;")
         conn.execute("DELETE FROM duplicate;")
         conn.execute("DELETE FROM issue;")
         readcur = conn.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
         for hashvalue, in fetchiter(readcur):
-            rows = conn.execute(sqlalchemy.text("SELECT content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id WHERE hash = :hashvalue;"),
+            rows = conn.execute(sqlalchemy.text("SELECT function.eqclass, content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id JOIN function ON hash.fid = function.id AND function.eqclass IS NOT NULL WHERE hash = :hashvalue;"),
                                 hashvalue=hashvalue).fetchall()
-            print("processing hash %s with %d entries" % (hashvalue, len(rows)))
-            pkgdict = compute_pkgdict(rows)
+            rowdict = dict()
             for row in rows:
-                cid = row[1]
-                already = conn.scalar(sqlalchemy.text("SELECT cid FROM duplicate WHERE cid = :cid;"),
-                                      cid=cid)
-                if not already:
-                    conn.execute(sqlalchemy.text("INSERT INTO duplicate (cid) VALUES (:cid);"),
-                                 cid=cid)
-            process_pkgdict(conn, pkgdict)
+                rowdict.setdefault(row[0], []).append(row[1:])
+            for eqclass, rows in rowdict.items():
+                if len(rows) < 2:
+                    print("skipping hash %s class %d with too few entries" % (hashvalue, eqclass))
+                    continue
+                print("processing hash %s class %d with %d entries" % (hashvalue, eqclass, len(rows)))
+                pkgdict = compute_pkgdict(rows)
+                for row in rows:
+                    cid = row[1]
+                    already = conn.scalar(sqlalchemy.text("SELECT cid FROM duplicate WHERE cid = :cid;"),
+                                          cid=cid)
+                    if not already:
+                        conn.execute(sqlalchemy.text("INSERT INTO duplicate (cid) VALUES (:cid);"),
+                                     cid=cid)
+                process_pkgdict(conn, pkgdict)
         conn.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'file named something.gz is not a valid gzip file' FROM content WHERE content.filename LIKE '%.gz' AND NOT EXISTS (SELECT 1 FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = 'gzip_sha512');")
         conn.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'png_sha512' AND lower(filename) NOT LIKE '%.png';")
         conn.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'gif image not named something.gif' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'gif_sha512' AND lower(filename) NOT LIKE '%.gif';")
 
 if __name__ == "__main__":
-    main()
+    parser = optparse.OptionParser()
+    parser.add_option("-d", "--database", action="store",
+                      default="sqlite:///test.sqlite3",
+                      help="location of the database")
+    options, args = parser.parse_args()
+    db = sqlalchemy.create_engine(options.database)
+    enable_sqlite_foreign_keys(db)
+    main(db)
diff --git a/webapp.py b/webapp.py
--- a/webapp.py
+++ b/webapp.py
@@ -1,6 +1,7 @@
 #!/usr/bin/python
 
 import datetime
+import optparse
 from wsgiref.simple_server import make_server
 
 import jinja2
@@ -12,16 +13,6 @@ from werkzeug.wsgi import SharedDataMiddleware
 
 from dedup.utils import fetchiter
 
-hash_functions = [
-    ("sha512", "sha512"),
-    ("png_sha512", "png_sha512"),
-    ("png_sha512", "gif_sha512"),
-    ("gif_sha512", "png_sha512"),
-    ("gif_sha512", "gif_sha512"),
-    ("gzip_sha512", "gzip_sha512"),
-    ("sha512", "gzip_sha512"),
-    ("gzip_sha512", "sha512")]
-
 jinjaenv = jinja2.Environment(loader=jinja2.PackageLoader("dedup", "templates"))
 
 def format_size(size):
@@ -134,11 +125,9 @@
     def cached_sharedstats(self, pid):
         sharedstats = {}
         with self.db.begin() as conn:
-            cur = conn.execute(sqlalchemy.text("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = :pid;"),
+            cur = conn.execute(sqlalchemy.text("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = :pid AND f1.eqclass = f2.eqclass;"),
                                pid=pid)
             for pid2, package2, func1, func2, files, size in fetchiter(cur):
-                if (func1, func2) not in hash_functions:
-                    continue
                 curstats = sharedstats.setdefault(
                     function_combination(func1, func2), list())
                 if pid2 == pid:
@@ -159,7 +148,7 @@ class Application(object):
         return html_response(package_template.render(params))
 
     def compute_comparison(self, pid1, pid2):
-        """Compute a sequence of comparison objects ordery by the size of the
+        """Compute a sequence of comparison objects ordered by the size of the
         object in the first package. Each element of the sequence is a dict
         defining the following keys:
          * filenames: A set of filenames in package 1 (pid1) all referring to
@@ -189,9 +178,9 @@
                 entry = dict(filenames=set((filename,)), size=size,
                              matches={})
                 files[hashvalue] = entry
-            cur = conn.execute(sqlalchemy.text("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = :cid AND pid = :pid;"),
+            cur2 = conn.execute(sqlalchemy.text("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = :cid AND pid = :pid AND fa.eqclass = fb.eqclass;"),
                                cid=cid, pid=pid2)
-            for func1, hashvalue, func2, filename in fetchiter(cur):
+            for func1, hashvalue, func2, filename in fetchiter(cur2):
                 entry["matches"].setdefault(filename, {})[func1, func2] = \
                     hashvalue
 
@@ -214,12 +203,11 @@
 
     def show_hash(self, function, hashvalue):
         with self.db.begin() as conn:
-            cur = conn.execute(sqlalchemy.text("SELECT package.name, content.filename, content.size, function.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function ON hash.fid = function.id WHERE hash = :hashvalue;"),
-                               hashvalue=hashvalue)
+            cur = conn.execute(sqlalchemy.text("SELECT package.name, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = :function AND hash = :hashvalue;"),
+                               function=function, hashvalue=hashvalue)
             entries = [dict(package=package, filename=filename, size=size,
                             function=otherfunc)
-                       for package, filename, size, otherfunc in fetchiter(cur)
-                       if (function, otherfunc) in hash_functions]
+                       for package, filename, size, otherfunc in fetchiter(cur)]
             if not entries:
                 raise NotFound()
             params = dict(function=function, hashvalue=hashvalue, entries=entries,
@@ -246,9 +234,14 @@ class Application(object):
         return html_response(source_template.render(params))
 
 def main():
-    db = sqlalchemy.create_engine("sqlite:///test.sqlite3")
+    parser = optparse.OptionParser()
+    parser.add_option("-d", "--database", action="store",
+                      default="sqlite:///test.sqlite3",
+                      help="location of the database")
+    options, args = parser.parse_args()
+    db = sqlalchemy.create_engine(options.database)
     app = Application(db)
-    app = SharedDataMiddleware(app, {"/": ("dedup", "static")})
+    app = SharedDataMiddleware(app, {"/static": ("dedup", "static")})
     make_server("0.0.0.0", 8800, app).serve_forever()
 
 if __name__ == "__main__":
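For context on the eqclass change pulled in from master: hash functions in
the same equivalence class produce comparable hashes (sha512 and
gzip_sha512 in class 1, png_sha512 and gif_sha512 in class 2), which lets
the queries above self-join the function table instead of consulting the
hard-coded hash_functions pair list that this diff removes from webapp.py.
A minimal sketch of the idea, using an in-memory SQLite database rather
than the project's full schema:

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.executescript("""
    CREATE TABLE function (id INTEGER PRIMARY KEY, name TEXT UNIQUE NOT NULL, eqclass INTEGER);
    INSERT INTO function (id, name, eqclass) VALUES
        (1, 'sha512', 1), (2, 'gzip_sha512', 1), (3, 'png_sha512', 2), (4, 'gif_sha512', 2);
    """)
    # A self-join on eqclass enumerates exactly the comparable function
    # pairs -- the same eight pairs the removed hash_functions list
    # spelled out by hand.
    pairs = conn.execute("""
    SELECT f1.name, f2.name FROM function AS f1
    JOIN function AS f2 ON f1.eqclass = f2.eqclass;
    """).fetchall()
    print(len(pairs))  # 8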