summary | refs | log | tree | commit | diff
path: root/dedup
diff options
context:
space:
mode:
Diffstat (limited to 'dedup')
-rw-r--r-- dedup/compression.py 15
-rw-r--r-- dedup/debpkg.py 55
-rw-r--r-- dedup/hashing.py 56
-rw-r--r-- dedup/image.py 2
-rw-r--r-- dedup/templates/base.html 4
-rw-r--r-- dedup/templates/binary.html 2
-rw-r--r-- dedup/templates/index.html 2
-rw-r--r-- dedup/utils.py 6
8 files changed, 136 insertions, 6 deletions
diff --git a/dedup/compression.py b/dedup/compression.py
index 869c49f..4ce258c 100644
--- a/dedup/compression.py
+++ b/dedup/compression.py
@@ -5,8 +5,11 @@ class GzipDecompressor(object):
"""An interface to gzip which is similar to bz2.BZ2Decompressor and
lzma.LZMADecompressor."""
def __init__(self):
+ self.sawheader = False
self.inbuffer = b""
self.decompressor = None
+ self.crc = 0
+ self.size = 0
def decompress(self, data):
"""
@@ -16,6 +19,8 @@ class GzipDecompressor(object):
while True:
if self.decompressor:
data = self.decompressor.decompress(data)
+ self.crc = zlib.crc32(data, self.crc)
+ self.size += len(data)
unused_data = self.decompressor.unused_data
if not unused_data:
return data
@@ -45,13 +50,20 @@ class GzipDecompressor(object):
return b""
data = self.inbuffer[skip:]
self.inbuffer = b""
+ self.sawheader = True
self.decompressor = zlib.decompressobj(-zlib.MAX_WBITS)
@property
def unused_data(self):
    """The input bytes that do not belong to the gzip stream.

    While a zlib decompressor is active its own unused_data is reported.
    Before any gzip header was seen, the buffered input is reported as is.
    Afterwards the buffer is expected to hold the gzip trailer: if it
    starts with the CRC-32 and size of the decompressed output and is
    followed by nothing but NUL padding, everything was consumed and b""
    is returned. Otherwise the buffer is returned unchanged.
    @rtype: bytes
    """
    if self.decompressor:
        return self.decompressor.unused_data
    if not self.sawheader:
        return self.inbuffer
    # The trailer stores CRC32 and ISIZE as unsigned little endian 32 bit
    # values, ISIZE being the size modulo 2**32. Masking makes this work
    # both for signed crc32 results and for outputs of 2 GiB and more,
    # where packing with the signed "l" format raises struct.error.
    expect = struct.pack("<LL", self.crc & 0xffffffff,
                         self.size & 0xffffffff)
    if self.inbuffer.startswith(expect):
        padding = self.inbuffer[len(expect):]
        # bytes literals keep the comparison valid for bytes input
        if not padding.strip(b"\0"):
            return b""
    return self.inbuffer
def flush(self):
@@ -67,6 +79,9 @@ class GzipDecompressor(object):
new.inbuffer = self.inbuffer
if self.decompressor:
new.decompressor = self.decompressor.copy()
+ new.sawheader = self.sawheader
+ new.crc = self.crc
+ new.size = self.size
return new
class DecompressedStream(object):
diff --git a/dedup/debpkg.py b/dedup/debpkg.py
new file mode 100644
index 0000000..2d67135
--- /dev/null
+++ b/dedup/debpkg.py
@@ -0,0 +1,55 @@
+from debian import deb822
+
+from dedup.hashing import hash_file
+
def process_control(control_contents):
    """Extract the fields dedup cares about from the control file found in
    the control.tar.gz of a Debian package.
    @type control_contents: bytes
    @rtype: {str: object}
    @returns: a dict with the keys package, source, version, architecture
        and depends, the latter being a set of package names
    """
    control = deb822.Packages(control_contents)
    package = control["package"].encode("ascii")
    try:
        source_field = control["source"]
    except KeyError:
        # without a Source field the binary package name is used
        source = package
    else:
        # only the first word is kept (presumably dropping a trailing
        # "(version)" annotation)
        source = source_field.encode("ascii").split()[0]
    version = control["version"].encode("ascii")
    architecture = control["architecture"].encode("ascii")
    depends = set()
    for dep in control.relations.get("depends", ()):
        # entries with more than one element (presumably alternatives such
        # as "a | b") are ignored
        if len(dep) != 1:
            continue
        # deb822 currently returns :any dependencies raw. see #670679
        name = dep[0]["name"].split(u':', 1)[0]
        depends.add(name.encode("ascii"))
    return dict(package=package, source=source, version=version,
                architecture=architecture, depends=depends)
+
class MultiHash(object):
    """Feed the same data to several hashlib-like objects at once. The
    wrapped objects stay accessible through the hashes attribute."""

    def __init__(self, *hashes):
        """
        @param hashes: any number of objects providing an update method
        """
        self.hashes = hashes

    def update(self, data):
        """Pass data on to each of the wrapped hash objects in order."""
        for hashobj in self.hashes:
            hashobj.update(data)
+
def get_tar_hashes(tar, hash_functions):
    """Walk a TarFile and hash the contents of every regular file with all
    of the given hash functions.
    @type tar: tarfile.TarFile
    @param hash_functions: a sequence of parameter-less functions each creating a
        new hashlib-like object
    @rtype: gen((str, int, {str: str}))
    @returns: an iterable of (filename, filesize, hashes) tuples where
        hashes is a dict mapping hash function names to hash values.
        Hashes with a false hexdigest (such as None) are left out.
    """
    for member in tar:
        if member.isreg():  # hard links are not regular, so they are skipped
            combined = MultiHash(*[create() for create in hash_functions])
            combined = hash_file(combined, tar.extractfile(member))
            digests = {}
            for hashobj in combined.hashes:
                # name is only looked up for hashes that produced a value
                hexvalue = hashobj.hexdigest()
                if hexvalue:
                    digests[hashobj.name] = hexvalue
            yield (member.name, member.size, digests)
diff --git a/dedup/hashing.py b/dedup/hashing.py
index 002eda8..a8a46c7 100644
--- a/dedup/hashing.py
+++ b/dedup/hashing.py
@@ -1,3 +1,5 @@
+import itertools
+
class HashBlacklist(object):
"""Turn a hashlib-like object into a hash that returns None for some
blacklisted hashes instead of the real hash value.
@@ -29,6 +31,54 @@ class HashBlacklist(object):
def copy(self):
return HashBlacklist(self.hashobj.copy(), self.blacklist)
class HashBlacklistContent(object):
    """Turn a hashlib-like object into a hash that returns None for some
    blacklisted content instead of the real hash value. Unlike HashBlacklist,
    not the output of the hash is considered, but its input."""

    def __init__(self, hashobj, blacklist=(), maxlen=None):
        """
        @param hashobj: a hashlib-like object
        @param blacklist: an object providing __contains__.
            hash inputs which are contained in the blacklist
            are turned into None values
        @param maxlen: the maximum length of a blacklisted input.
            Defaults to the length of the longest blacklist entry (0 for an
            empty blacklist), so if it is absent, the blacklist must
            support iteration.
        """
        self.hashobj = hashobj
        self.blacklist = blacklist
        if maxlen is None:
            # the leading 0 avoids passing the empty sequence to max
            maxlen = max([0] + [len(entry) for entry in blacklist])
        self.maxlen = maxlen
        # The input seen so far. It is replaced with None as soon as it is
        # longer than maxlen, because it can no longer be blacklisted then.
        self.stored = b""

    @property
    def name(self):
        """The name of the wrapped hash object."""
        return self.hashobj.name

    def _is_blacklisted(self):
        """Tell whether the complete input seen so far is blacklisted."""
        return self.stored is not None and self.stored in self.blacklist

    def update(self, data):
        """Feed data to the wrapped hash and remember it for the blacklist
        check as long as the total input does not exceed maxlen."""
        if self.stored is not None:
            # checking the length first avoids building an oversized string
            if len(self.stored) + len(data) > self.maxlen:
                self.stored = None
            else:
                self.stored += data
        self.hashobj.update(data)

    def digest(self):
        """
        @returns: None if the input is blacklisted and the digest of the
            wrapped hash otherwise
        """
        if self._is_blacklisted():
            return None
        return self.hashobj.digest()

    def hexdigest(self):
        """
        @returns: None if the input is blacklisted and the hexdigest of the
            wrapped hash otherwise
        """
        if self._is_blacklisted():
            return None
        return self.hashobj.hexdigest()

    def copy(self):
        """
        @returns: an independent HashBlacklistContent in the same state
        """
        new = HashBlacklistContent(self.hashobj.copy(), self.blacklist,
                                   self.maxlen)
        # The remembered input is part of the state. Without it the copy
        # would judge only the data it receives after copying.
        new.stored = self.stored
        return new
+
class DecompressedHash(object):
"""Apply a decompression function before the hash. This class provides the
hashlib interface (update, hexdigest, copy) excluding digest and name."""
@@ -49,9 +99,13 @@ class DecompressedHash(object):
def hexdigest(self):
    """Return the hexdigest of the decompressed data.

    If the decompressor has a flush method, copies of the decompressor
    and of the hash are used: the copy of the decompressor is flushed and
    the flushed data is added to the copy of the hash. Otherwise the hash
    is queried directly.
    @raises ValueError: if the decompressor reports unused data
    """
    leftover = "decompressor did not consume all data"
    if hasattr(self.decompressor, "flush"):
        decompressor_copy = self.decompressor.copy()
        tail = decompressor_copy.flush()
        if decompressor_copy.unused_data:
            raise ValueError(leftover)
        hash_copy = self.hashobj.copy()
        hash_copy.update(tail)
        return hash_copy.hexdigest()
    # nothing to flush, so only check for input that was left over
    if self.decompressor.unused_data:
        raise ValueError(leftover)
    return self.hashobj.hexdigest()
@@ -61,7 +115,7 @@ class DecompressedHash(object):
class SuppressingHash(object):
"""A hash that silences exceptions from the update and hexdigest methods of
- a hashlib-like object. If an exception has occured, hexdigest always
+ a hashlib-like object. If an exception has occurred, hexdigest always
returns None."""
def __init__(self, hashobj, exceptions=()):
"""
diff --git a/dedup/image.py b/dedup/image.py
index c1f2de0..ef17989 100644
--- a/dedup/image.py
+++ b/dedup/image.py
@@ -4,7 +4,7 @@ import struct
import PIL.Image
class ImageHash(object):
- """A hash on the contents of an image datat type supported by PIL. This
+ """A hash on the contents of an image data type supported by PIL. This
disregards mode, depth and meta information. Note that due to limitations
in PIL and the image format (interlacing) the full contents are stored and
decoded in hexdigest."""
diff --git a/dedup/templates/base.html b/dedup/templates/base.html
index 62f4087..9dfb788 100644
--- a/dedup/templates/base.html
+++ b/dedup/templates/base.html
@@ -3,8 +3,8 @@
<head>
<title>{% block title %}{% endblock %}</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
- <link rel="stylesheet" type="text/css" href="{{ urlroot|e }}/style.css">
- <link rel="icon" type="image/vnd.microsoft.icon" href="{{ urlroot|e }}/favicon.ico">
+ <link rel="stylesheet" type="text/css" href="{{ urlroot|e }}/static/style.css">
+ <link rel="icon" type="image/vnd.microsoft.icon" href="{{ urlroot|e }}/static/favicon.ico">
{% block header %}{% endblock %}
</head>
<body>
diff --git a/dedup/templates/binary.html b/dedup/templates/binary.html
index 69eceef..46c4fa6 100644
--- a/dedup/templates/binary.html
+++ b/dedup/templates/binary.html
@@ -21,7 +21,7 @@
<p>Note: Packages with yellow background are required to be installed when this package is installed.</p>
{%- endif -%}
{%- if issues -%}
- <h3>issues with particular files</h3>
+ <h3 id="issues">issues with particular files</h3>
<table border='1'><tr><th>filename</th><th>issue</th></tr>
{%- for filename, issue in issues|dictsort(true) -%}
<tr><td><span class="filename">{{ filename|e }}</span></td><td>{{ issue|e }}</td></tr>
diff --git a/dedup/templates/index.html b/dedup/templates/index.html
index 7c9000f..169027e 100644
--- a/dedup/templates/index.html
+++ b/dedup/templates/index.html
@@ -28,7 +28,7 @@
{% block content %}
<h1>Debian duplication detector</h1>
<ul>
-<li>To inspect a particlar binary package, go to <pre>binary/&lt;packagename&gt;</pre> Example: <a href="binary/git">binary/git</a>
+<li>To inspect a particular binary package, go to <pre>binary/&lt;packagename&gt;</pre> Example: <a href="binary/git">binary/git</a>
<div style="display:none" id="form_div"><fieldset>
<legend>Inspect package</legend>
<noscript><b>This form is dysfunctional when javascript is not enabled</b></noscript>
diff --git a/dedup/utils.py b/dedup/utils.py
index 6864ad3..fd30378 100644
--- a/dedup/utils.py
+++ b/dedup/utils.py
@@ -1,3 +1,4 @@
+from debian.debian_support import version_compare
import sqlalchemy.event
def fetchiter(cursor):
@@ -12,3 +13,8 @@ def enable_sqlite_foreign_keys(engine):
def pragma_foreign_keys(connection, _):
connection.execute("PRAGMA foreign_keys=ON;")
def sqlite_add_version_compare(engine):
    """Make Debian version comparison available on every connection that
    the given engine opens. version_compare is registered both as the
    collation "debian_version" and as the two-argument SQL function
    "debian_version_compare".
    @param engine: the sqlalchemy engine whose "connect" event is used
    """
    def add_version_compare(connection, _):
        # the second argument passed with the event is not needed here
        connection.create_collation("debian_version", version_compare)
        connection.create_function("debian_version_compare", 2, version_compare)

    sqlalchemy.event.listen(engine, "connect", add_version_compare)