From d228c0a4a5827325bca47d63ea287c7cb56537ea Mon Sep 17 00:00:00 2001
From: Helmut Grohne <helmut@subdivi.de>
Date: Thu, 3 Oct 2013 08:51:41 +0200
Subject: work around python-debian's #670679

---
 dedup/debpkg.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'dedup')

diff --git a/dedup/debpkg.py b/dedup/debpkg.py
index d8cc22f..2d67135 100644
--- a/dedup/debpkg.py
+++ b/dedup/debpkg.py
@@ -16,8 +16,8 @@ def process_control(control_contents):
         source = package
     version = control["version"].encode("ascii")
     architecture = control["architecture"].encode("ascii")
-
-    depends = set(dep[0]["name"].encode("ascii")
+    # deb822 currently returns :any dependencies raw. see #670679
+    depends = set(dep[0]["name"].split(u':', 1)[0].encode("ascii")
                   for dep in control.relations.get("depends", ())
                   if len(dep) == 1)
     return dict(package=package, source=source, version=version,
-- 
cgit v1.2.3


From 17597b5e828f9bbc9b0159102b173c284c23a140 Mon Sep 17 00:00:00 2001
From: Helmut Grohne <helmut@subdivi.de>
Date: Wed, 19 Feb 2014 07:54:21 +0100
Subject: DecompressedHash should fail on trailing input

Otherwise all files smaller than 10 bytes are successfully hashed to the
hash of the empty input when using the GzipDecompressor.

Reported-By: Olly Betts
---
 dedup/hashing.py | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'dedup')

diff --git a/dedup/hashing.py b/dedup/hashing.py
index 002eda8..5f015b2 100644
--- a/dedup/hashing.py
+++ b/dedup/hashing.py
@@ -49,9 +49,13 @@ class DecompressedHash(object):
 
     def hexdigest(self):
         if not hasattr(self.decompressor, "flush"):
+            if self.decompressor.unused_data:
+                raise ValueError("decompressor did not consume all data")
             return self.hashobj.hexdigest()
         tmpdecomp = self.decompressor.copy()
         data = tmpdecomp.flush()
+        if tmpdecomp.unused_data:
+            raise ValueError("decompressor did not consume all data")
         tmphash = self.hashobj.copy()
         tmphash.update(data)
         return tmphash.hexdigest()
-- 
cgit v1.2.3


From d467a2a4e85d4b6f09bd2e3dc70466bfcc45a577 Mon Sep 17 00:00:00 2001
From: Helmut Grohne <helmut@subdivi.de>
Date: Wed, 19 Feb 2014 14:19:56 +0100
Subject: GzipDecompressor: don't treat checksum as garbage trailer

---
 dedup/compression.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'dedup')

diff --git a/dedup/compression.py b/dedup/compression.py
index 869c49f..4ce258c 100644
--- a/dedup/compression.py
+++ b/dedup/compression.py
@@ -5,8 +5,11 @@ class GzipDecompressor(object):
     """An interface to gzip which is similar to bz2.BZ2Decompressor and
     lzma.LZMADecompressor."""
     def __init__(self):
+        self.sawheader = False
         self.inbuffer = b""
         self.decompressor = None
+        self.crc = 0
+        self.size = 0
 
     def decompress(self, data):
         """
@@ -16,6 +19,8 @@ class GzipDecompressor(object):
         while True:
             if self.decompressor:
                 data = self.decompressor.decompress(data)
+                self.crc = zlib.crc32(data, self.crc)
+                self.size += len(data)
                 unused_data = self.decompressor.unused_data
                 if not unused_data:
                     return data
@@ -45,13 +50,20 @@ class GzipDecompressor(object):
                 return b""
             data = self.inbuffer[skip:]
             self.inbuffer = b""
+            self.sawheader = True
             self.decompressor = zlib.decompressobj(-zlib.MAX_WBITS)
 
     @property
     def unused_data(self):
         if self.decompressor:
             return self.decompressor.unused_data
+        elif not self.sawheader:
+            return self.inbuffer
         else:
+            expect = struct.pack("<LL", self.crc % 2**32, self.size % 2**32)
+            padding = self.inbuffer[len(expect):]  # only NULs are tolerated
+            if self.inbuffer.startswith(expect) and not padding.strip(b"\0"):
+                return b""  # trailer (crc32, size modulo 2**32) matched
             return self.inbuffer
 
     def flush(self):
@@ -67,6 +79,9 @@ class GzipDecompressor(object):
         new.inbuffer = self.inbuffer
         if self.decompressor:
             new.decompressor = self.decompressor.copy()
+        new.sawheader = self.sawheader
+        new.crc = self.crc
+        new.size = self.size
         return new
 
 class DecompressedStream(object):
-- 
cgit v1.2.3


From 332ac9eafb235443f163c606ced95dcbd615815e Mon Sep 17 00:00:00 2001
From: Helmut Grohne <helmut@subdivi.de>
Date: Wed, 19 Feb 2014 14:21:20 +0100
Subject: blacklist content rather than hashes

Otherwise the gzip hash cannot tell the empty stream and the
compressed empty stream apart.
---
 dedup/hashing.py | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 importpkg.py     | 15 +++++----------
 2 files changed, 55 insertions(+), 10 deletions(-)

(limited to 'dedup')

diff --git a/dedup/hashing.py b/dedup/hashing.py
index 5f015b2..70f6268 100644
--- a/dedup/hashing.py
+++ b/dedup/hashing.py
@@ -1,3 +1,5 @@
+import itertools
+
 class HashBlacklist(object):
     """Turn a hashlib-like object into a hash that returns None for some
     blacklisted hashes instead of the real hash value.
@@ -29,6 +31,65 @@ class HashBlacklist(object):
     def copy(self):
         return HashBlacklist(self.hashobj.copy(), self.blacklist)
 
+class HashBlacklistContent(object):
+    """Turn a hashlib-like object into a hash that returns None for some
+    blacklisted content instead of the real hash value. Unlike HashBlacklist,
+    not the output of the hash is considered, but its input."""
+
+    def __init__(self, hashobj, blacklist=(), maxlen=None):
+        """
+        @param hashobj: a hashlib-like object
+        @param blacklist: an object providing __contains__.
+            hash inputs which are contained in the blacklist
+            are turned into None values
+        @param maxlen: the maximum length of a blacklisted input.
+            Defaults to max(map(len, blacklist)), so if it is absent,
+            the blacklist must support iteration.
+        """
+        self.hashobj = hashobj
+        self.blacklist = blacklist
+        if maxlen is None:
+            # the chain avoids passing the empty sequence to max
+            maxlen = max(itertools.chain((0,), (len(b) for b in blacklist)))
+        self.maxlen = maxlen
+        # the input seen so far or None once it is longer than maxlen and
+        # thus can no longer match a blacklisted input
+        self.stored = b""
+
+    @property
+    def name(self):
+        return self.hashobj.name
+
+    def update(self, data):
+        """Feed data to the wrapped hash and remember it for the blacklist
+        lookup as long as the total input does not exceed maxlen."""
+        if self.stored is not None:
+            self.stored += data
+            if len(self.stored) > self.maxlen:
+                self.stored = None
+        self.hashobj.update(data)
+
+    def digest(self):
+        """@returns: None for blacklisted input, else the digest"""
+        if self.stored is not None and self.stored in self.blacklist:
+            return None
+        return self.hashobj.digest()
+
+    def hexdigest(self):
+        """@returns: None for blacklisted input, else the hexdigest"""
+        if self.stored is not None and self.stored in self.blacklist:
+            return None
+        return self.hashobj.hexdigest()
+
+    def copy(self):
+        """@returns: an independent copy that has seen the same input"""
+        new = HashBlacklistContent(self.hashobj.copy(), self.blacklist,
+                                   self.maxlen)
+        # the constructor starts out with empty input, so the input stored
+        # so far must be carried over for the blacklist lookup of the copy
+        new.stored = self.stored
+        return new
+
 class DecompressedHash(object):
     """Apply a decompression function before the hash. This class provides the
     hashlib interface (update, hexdigest, copy) excluding digest and name."""
diff --git a/importpkg.py b/importpkg.py
index 54f6181..cb16f97 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -16,26 +16,21 @@ import yaml
 
 from dedup.arreader import ArReader
 from dedup.debpkg import process_control, get_tar_hashes
-from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \
-    HashedStream
+from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
+        HashBlacklistContent
 from dedup.compression import GzipDecompressor, DecompressedStream
 from dedup.image import GIFHash, PNGHash
 
-boring_sha512_hashes = set((
-    # ""
-    "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
-    # "\n"
-    "be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09"))
+boring_content = set(("", "\n"))
 
 def sha512_nontrivial():
-    return HashBlacklist(hashlib.sha512(), boring_sha512_hashes)
+    return HashBlacklistContent(hashlib.sha512(), boring_content)
 
 def gziphash():
     hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512())
     hashobj = SuppressingHash(hashobj, (ValueError, zlib.error))
     hashobj.name = "gzip_sha512"
-    # don't blacklist boring hashes for gzip to get gzip issues right
-    return hashobj
+    return HashBlacklistContent(hashobj, boring_content)
 
 def pnghash():
     hashobj = PNGHash(hashlib.sha512())
-- 
cgit v1.2.3


From 8ccd5205f77276b333c56efb8271a0ddf11590a0 Mon Sep 17 00:00:00 2001
From: Helmut Grohne <helmut@subdivi.de>
Date: Sun, 23 Feb 2014 17:29:41 +0100
Subject: fix spelling mistake

Reported-By: Stefan Kaltenbrunner
---
 dedup/templates/index.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'dedup')

diff --git a/dedup/templates/index.html b/dedup/templates/index.html
index 7c9000f..169027e 100644
--- a/dedup/templates/index.html
+++ b/dedup/templates/index.html
@@ -28,7 +28,7 @@
 {% block content %}
 <h1>Debian duplication detector</h1>
 <ul>
-<li>To inspect a particlar binary package, go to <pre>binary/&lt;packagename&gt;</pre> Example: <a href="binary/git">binary/git</a>
+<li>To inspect a particular binary package, go to <pre>binary/&lt;packagename&gt;</pre> Example: <a href="binary/git">binary/git</a>
     <div style="display:none" id="form_div"><fieldset>
             <legend>Inspect package</legend>
             <noscript><b>This form is dysfunctional when javascript is not enabled</b></noscript>
-- 
cgit v1.2.3


From 8d4c5512edbdcdd1063a7e6508f398a5a57981be Mon Sep 17 00:00:00 2001
From: Helmut Grohne <helmut@subdivi.de>
Date: Sun, 23 Feb 2014 18:19:35 +0100
Subject: spell check comments

---
 dedup/hashing.py | 2 +-
 dedup/image.py   | 2 +-
 importpkg.py     | 4 ++--
 webapp.py        | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'dedup')

diff --git a/dedup/hashing.py b/dedup/hashing.py
index 70f6268..a8a46c7 100644
--- a/dedup/hashing.py
+++ b/dedup/hashing.py
@@ -115,7 +115,7 @@ class DecompressedHash(object):
 
 class SuppressingHash(object):
     """A hash that silences exceptions from the update and hexdigest methods of
-    a hashlib-like object. If an exception has occured, hexdigest always
+    a hashlib-like object. If an exception has occurred, hexdigest always
     returns None."""
     def __init__(self, hashobj, exceptions=()):
         """
diff --git a/dedup/image.py b/dedup/image.py
index c1f2de0..ef17989 100644
--- a/dedup/image.py
+++ b/dedup/image.py
@@ -4,7 +4,7 @@ import struct
 import PIL.Image
 
 class ImageHash(object):
-    """A hash on the contents of an image datat type supported by PIL. This
+    """A hash on the contents of an image data type supported by PIL. This
     disregards mode, depth and meta information. Note that due to limitations
     in PIL and the image format (interlacing) the full contents are stored and
     decoded in hexdigest."""
diff --git a/importpkg.py b/importpkg.py
index cb16f97..aeccda5 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -1,7 +1,7 @@
 #!/usr/bin/python
-"""This tool reads a debian package from stdin and emits a yaml stream on
+"""This tool reads a Debian package from stdin and emits a yaml stream on
 stdout.  It does not access a database. Therefore it can be run in parallel and
-on multiple machines. The generated yaml conatins multiple documents. The first
+on multiple machines. The generated yaml contains multiple documents. The first
 document contains package metadata. Then a document is emitted for each file.
 And finally a document consisting of the string "commit" is emitted."""
 
diff --git a/webapp.py b/webapp.py
index fd6d685..2fd69bb 100755
--- a/webapp.py
+++ b/webapp.py
@@ -151,7 +151,7 @@ class Application(object):
         return html_response(package_template.render(params))
 
     def compute_comparison(self, pid1, pid2):
-        """Compute a sequence of comparison objects ordery by the size of the
+        """Compute a sequence of comparison objects ordered by the size of the
         object in the first package. Each element of the sequence is a dict
         defining the following keys:
          * filenames: A set of filenames in package 1 (pid1) all referring to
-- 
cgit v1.2.3


From cb3900603b79731891adbe44a1a1b3eb19f16cad Mon Sep 17 00:00:00 2001
From: Guillem Jover <guillem@debian.org>
Date: Thu, 8 May 2014 01:46:21 +0200
Subject: dedup.arreader: remove trailing slash from ar members

The GNU ar format adds a trailing slash to the member names, normalize
the member names to take this into account.

Signed-off-by: Guillem Jover <guillem@debian.org>
---
 dedup/arreader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'dedup')

diff --git a/dedup/arreader.py b/dedup/arreader.py
index d74ae37..e53efd9 100644
--- a/dedup/arreader.py
+++ b/dedup/arreader.py
@@ -45,7 +45,7 @@ class ArReader(object):
         if not file_header:
             raise EOFError("end of archive found")
         parts = struct.unpack("16s 12s 6s 6s 8s 10s 2s", file_header)
-        parts = [p.rstrip(b" ") for p in parts]
+        parts = [p.rstrip(b"/ ") for p in parts]
         if parts.pop() != self.file_magic:
             raise ValueError("ar file header not found")
         self.remaining = int(parts[5])
-- 
cgit v1.2.3


From ddaa08f7a63a1fedf4c1f2804873199dd5182142 Mon Sep 17 00:00:00 2001
From: Guillem Jover <guillem@debian.org>
Date: Wed, 7 May 2014 21:06:38 +0200
Subject: importpkg: add support for control.tar and control.tar.xz

dpkg supports those since 1.17.6.

Signed-off-by: Guillem Jover <guillem@debian.org>
---
 dedup/debpkg.py |  2 +-
 importpkg.py    | 61 +++++++++++++++++++++++++++++++++++----------------------
 2 files changed, 39 insertions(+), 24 deletions(-)

(limited to 'dedup')

diff --git a/dedup/debpkg.py b/dedup/debpkg.py
index 2d67135..dbee849 100644
--- a/dedup/debpkg.py
+++ b/dedup/debpkg.py
@@ -3,7 +3,7 @@ from debian import deb822
 from dedup.hashing import hash_file
 
 def process_control(control_contents):
-    """Parses the contents of a control file from a control.tar.gz of a Debian
+    """Parses the contents of a control file from a control.tar of a Debian
     package and returns a dictionary containing the fields relevant to dedup.
     @type control_contents: bytes
     @rtype: {str: object}
diff --git a/importpkg.py b/importpkg.py
index aeccda5..7482c4f 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -54,41 +54,56 @@ def process_package(filelike, hash_functions):
         except EOFError:
             raise ValueError("data.tar not found")
         if name == "control.tar.gz":
-            if state != "start":
-                raise ValueError("unexpected control.tar.gz")
-            state = "control"
+            new_state = "control"
             tf = tarfile.open(fileobj=af, mode="r|gz")
-            for elem in tf:
-                if elem.name != "./control":
-                    continue
-                if state != "control":
-                    raise ValueError("duplicate control file")
-                state = "control_file"
-                yield process_control(tf.extractfile(elem).read())
-                break
-            continue
+        elif name == "control.tar.xz":
+            new_state = "control"
+            zf = DecompressedStream(af, lzma.LZMADecompressor())
+            tf = tarfile.open(fileobj=zf, mode="r|")
+        elif name == "control.tar":
+            new_state = "control"
+            tf = tarfile.open(fileobj=af, mode="r|")
         elif name == "data.tar.gz":
+            new_state = "data"
             tf = tarfile.open(fileobj=af, mode="r|gz")
         elif name == "data.tar.bz2":
+            new_state = "data"
             tf = tarfile.open(fileobj=af, mode="r|bz2")
         elif name == "data.tar.xz":
+            new_state = "data"
             zf = DecompressedStream(af, lzma.LZMADecompressor())
             tf = tarfile.open(fileobj=zf, mode="r|")
         elif name == "data.tar":
+            new_state = "data"
             tf = tarfile.open(fileobj=af, mode="r|")
         else:
             continue
-        if state != "control_file":
-            raise ValueError("missing control file")
-        for name, size, hashes in get_tar_hashes(tf, hash_functions):
-            try:
-                name = name.decode("utf8")
-            except UnicodeDecodeError:
-                print("warning: skipping filename with encoding error")
-                continue # skip files with non-utf8 encoding for now
-            yield dict(name=name, size=size, hashes=hashes)
-        yield "commit"
-        break
+        if new_state == "control":
+            if state != "start":
+                raise ValueError("unexpected control.tar")
+            state = new_state
+            for elem in tf:
+                if elem.name != "./control":
+                    continue
+                if state != "control":
+                    raise ValueError("duplicate control file")
+                state = "control_file"
+                yield process_control(tf.extractfile(elem).read())
+                break
+            continue
+        elif new_state == "data":
+            if state != "control_file":
+                raise ValueError("missing control file")
+            state = new_state
+            for name, size, hashes in get_tar_hashes(tf, hash_functions):
+                try:
+                    name = name.decode("utf8")
+                except UnicodeDecodeError:
+                    print("warning: skipping filename with encoding error")
+                    continue # skip files with non-utf8 encoding for now
+                yield dict(name=name, size=size, hashes=hashes)
+            yield "commit"
+            break
 
 def process_package_with_hash(filelike, hash_functions, sha256hash):
     hstream = HashedStream(filelike, hashlib.sha256())
-- 
cgit v1.2.3