From 3134b18dd8e4932b03b87453e6ee4b4a93b5595f Mon Sep 17 00:00:00 2001
From: Helmut Grohne <helmut@subdivi.de>
Date: Mon, 2 Sep 2013 09:30:05 +0200
Subject: importpkg: move library-like parts to dedup.debpkg

---
 dedup/debpkg.py | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 dedup/debpkg.py

(limited to 'dedup/debpkg.py')

diff --git a/dedup/debpkg.py b/dedup/debpkg.py
new file mode 100644
index 0000000..d8cc22f
--- /dev/null
+++ b/dedup/debpkg.py
@@ -0,0 +1,55 @@
+from debian import deb822
+
+from dedup.hashing import hash_file
+
+def process_control(control_contents):
+    """Parses the contents of a control file from a control.tar.gz of a Debian
+    package and returns a dictionary containing the fields relevant to dedup.
+    @type control_contents: bytes
+    @rtype: {str: object}
+    """
+    control = deb822.Packages(control_contents)
+    package = control["package"].encode("ascii")
+    try:
+        source = control["source"].encode("ascii").split()[0]
+    except KeyError:
+        source = package
+    version = control["version"].encode("ascii")
+    architecture = control["architecture"].encode("ascii")
+
+    depends = set(dep[0]["name"].encode("ascii")
+                  for dep in control.relations.get("depends", ())
+                  if len(dep) == 1)
+    return dict(package=package, source=source, version=version,
+                architecture=architecture, depends=depends)
+
+class MultiHash(object):
+    def __init__(self, *hashes):
+        self.hashes = hashes
+
+    def update(self, data):
+        for hasher in self.hashes:
+            hasher.update(data)
+
+def get_tar_hashes(tar, hash_functions):
+    """Given a TarFile read all regular files and compute all of the given hash
+    functions on each file.
+    @type tar: tarfile.TarFile
+    @param hash_functions: a sequence of parameter-less functions each creating a
+            new hashlib-like object
+    @rtype: gen((str, int, {str: str}))
+    @returns: an iterable of (filename, filesize, hashes) tuples where
+            hashes is a dict mapping hash function names to hash values
+    """
+
+    for elem in tar:
+        if not elem.isreg(): # excludes hard links as well
+            continue
+        hasher = MultiHash(*[func() for func in hash_functions])
+        hasher = hash_file(hasher, tar.extractfile(elem))
+        hashes = {}
+        for hashobj in hasher.hashes:
+            hashvalue = hashobj.hexdigest()
+            if hashvalue:
+                hashes[hashobj.name] = hashvalue
+        yield (elem.name, elem.size, hashes)
-- 
cgit v1.2.3


From d228c0a4a5827325bca47d63ea287c7cb56537ea Mon Sep 17 00:00:00 2001
From: Helmut Grohne <helmut@subdivi.de>
Date: Thu, 3 Oct 2013 08:51:41 +0200
Subject: work around python-debian's #670679

---
 dedup/debpkg.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'dedup/debpkg.py')

diff --git a/dedup/debpkg.py b/dedup/debpkg.py
index d8cc22f..2d67135 100644
--- a/dedup/debpkg.py
+++ b/dedup/debpkg.py
@@ -16,8 +16,8 @@ def process_control(control_contents):
         source = package
     version = control["version"].encode("ascii")
     architecture = control["architecture"].encode("ascii")
-
-    depends = set(dep[0]["name"].encode("ascii")
+    # deb822 currently returns :any dependencies raw. See bug #670679.
+    depends = set(dep[0]["name"].split(u':', 1)[0].encode("ascii")
                   for dep in control.relations.get("depends", ())
                   if len(dep) == 1)
     return dict(package=package, source=source, version=version,
-- 
cgit v1.2.3