summary refs log tree commit diff
path: root/dedup
diff options
context:
space:
mode:
Diffstat (limited to 'dedup')
-rw-r--r-- dedup/arreader.py 2
-rw-r--r-- dedup/compression.py 15
-rw-r--r-- dedup/debpkg.py 6
-rw-r--r-- dedup/hashing.py 56
-rw-r--r-- dedup/image.py 2
-rw-r--r-- dedup/templates/index.html 2
6 files changed, 76 insertions, 7 deletions
diff --git a/dedup/arreader.py b/dedup/arreader.py
index d74ae37..e53efd9 100644
--- a/dedup/arreader.py
+++ b/dedup/arreader.py
@@ -45,7 +45,7 @@ class ArReader(object):
if not file_header:
raise EOFError("end of archive found")
parts = struct.unpack("16s 12s 6s 6s 8s 10s 2s", file_header)
- parts = [p.rstrip(b" ") for p in parts]
+ parts = [p.rstrip(b"/ ") for p in parts]
if parts.pop() != self.file_magic:
raise ValueError("ar file header not found")
self.remaining = int(parts[5])
diff --git a/dedup/compression.py b/dedup/compression.py
index 869c49f..4ce258c 100644
--- a/dedup/compression.py
+++ b/dedup/compression.py
@@ -5,8 +5,11 @@ class GzipDecompressor(object):
"""An interface to gzip which is similar to bz2.BZ2Decompressor and
lzma.LZMADecompressor."""
def __init__(self):
+ self.sawheader = False
self.inbuffer = b""
self.decompressor = None
+ self.crc = 0
+ self.size = 0
def decompress(self, data):
"""
@@ -16,6 +19,8 @@ class GzipDecompressor(object):
while True:
if self.decompressor:
data = self.decompressor.decompress(data)
+ self.crc = zlib.crc32(data, self.crc)
+ self.size += len(data)
unused_data = self.decompressor.unused_data
if not unused_data:
return data
@@ -45,13 +50,20 @@ class GzipDecompressor(object):
return b""
data = self.inbuffer[skip:]
self.inbuffer = b""
+ self.sawheader = True
self.decompressor = zlib.decompressobj(-zlib.MAX_WBITS)
@property
def unused_data(self):
if self.decompressor:
return self.decompressor.unused_data
+ elif not self.sawheader:
+ return self.inbuffer
else:
+ expect = struct.pack("<ll", self.crc, self.size)
+ if self.inbuffer.startswith(expect) and \
+ self.inbuffer[len(expect):].replace("\0", "") == "":
+ return b""
return self.inbuffer
def flush(self):
@@ -67,6 +79,9 @@ class GzipDecompressor(object):
new.inbuffer = self.inbuffer
if self.decompressor:
new.decompressor = self.decompressor.copy()
+ new.sawheader = self.sawheader
+ new.crc = self.crc
+ new.size = self.size
return new
class DecompressedStream(object):
diff --git a/dedup/debpkg.py b/dedup/debpkg.py
index 6d857ba..cd8616f 100644
--- a/dedup/debpkg.py
+++ b/dedup/debpkg.py
@@ -3,7 +3,7 @@ from debian import deb822
from dedup.hashing import hash_file
def process_control(control_contents):
- """Parses the contents of a control file from a control.tar.gz of a Debian
+ """Parses the contents of a control file from a control.tar of a Debian
package and returns a dictionary containing the fields relevant to dedup.
@type control_contents: bytes
@rtype: {str: object}
@@ -16,8 +16,8 @@ def process_control(control_contents):
source = package
version = control["version"].encode("ascii")
architecture = control["architecture"].encode("ascii")
-
- depends = set(dep[0]["name"].encode("ascii")
+ # deb822 currently returns :any dependencies raw. see #670679
+ depends = set(dep[0]["name"].split(u':', 1)[0].encode("ascii")
for dep in control.relations.get("depends", ())
if len(dep) == 1)
ret = dict(package=package, source=source, version=version,
diff --git a/dedup/hashing.py b/dedup/hashing.py
index 002eda8..a8a46c7 100644
--- a/dedup/hashing.py
+++ b/dedup/hashing.py
@@ -1,3 +1,5 @@
+import itertools
+
class HashBlacklist(object):
"""Turn a hashlib-like object into a hash that returns None for some
blacklisted hashes instead of the real hash value.
@@ -29,6 +31,54 @@ class HashBlacklist(object):
def copy(self):
return HashBlacklist(self.hashobj.copy(), self.blacklist)
+class HashBlacklistContent(object):
+ """Turn a hashlib-like object into a hash that returns None for some
+ blacklisted content instead of the real hash value. Unlike HashBlacklist,
+ not the output of the hash is considered, but its input."""
+
+ def __init__(self, hashobj, blacklist=(), maxlen=None):
+ """
+ @param hashobj: a hashlib-like object
+ @param blacklist: an object providing __contains__.
+ hash inputs which are contained in the blacklist
+ are turned into None values
+ @param maxlen: the maximum length of a blacklisted input.
+ Defaults to max(map(len, blacklist)), so if it is absent,
+ the blacklist must support iteration.
+ """
+ self.hashobj = hashobj
+ self.blacklist = blacklist
+ if maxlen is None:
+ # the chain avoids passing the empty sequence to max
+ maxlen = max(itertools.chain((0,), itertools.imap(len, blacklist)))
+ self.maxlen = maxlen
+ self.stored = ""
+
+ @property
+ def name(self):
+ return self.hashobj.name
+
+ def update(self, data):
+ if self.stored is not None:
+ self.stored += data
+ if len(self.stored) > self.maxlen:
+ self.stored = None
+ self.hashobj.update(data)
+
+ def digest(self):
+ if self.stored is not None and self.stored in self.blacklist:
+ return None
+ return self.hashobj.digest()
+
+ def hexdigest(self):
+ if self.stored is not None and self.stored in self.blacklist:
+ return None
+ return self.hashobj.hexdigest()
+
+ def copy(self):
+ return HashBlacklistContent(self.hashobj.copy(), self.blacklist,
+ self.maxlen)
+
class DecompressedHash(object):
"""Apply a decompression function before the hash. This class provides the
hashlib interface (update, hexdigest, copy) excluding digest and name."""
@@ -49,9 +99,13 @@ class DecompressedHash(object):
def hexdigest(self):
if not hasattr(self.decompressor, "flush"):
+ if self.decompressor.unused_data:
+ raise ValueError("decompressor did not consume all data")
return self.hashobj.hexdigest()
tmpdecomp = self.decompressor.copy()
data = tmpdecomp.flush()
+ if tmpdecomp.unused_data:
+ raise ValueError("decompressor did not consume all data")
tmphash = self.hashobj.copy()
tmphash.update(data)
return tmphash.hexdigest()
@@ -61,7 +115,7 @@ class DecompressedHash(object):
class SuppressingHash(object):
"""A hash that silences exceptions from the update and hexdigest methods of
- a hashlib-like object. If an exception has occured, hexdigest always
+ a hashlib-like object. If an exception has occurred, hexdigest always
returns None."""
def __init__(self, hashobj, exceptions=()):
"""
diff --git a/dedup/image.py b/dedup/image.py
index c1f2de0..ef17989 100644
--- a/dedup/image.py
+++ b/dedup/image.py
@@ -4,7 +4,7 @@ import struct
import PIL.Image
class ImageHash(object):
- """A hash on the contents of an image datat type supported by PIL. This
+ """A hash on the contents of an image data type supported by PIL. This
disregards mode, depth and meta information. Note that due to limitations
in PIL and the image format (interlacing) the full contents are stored and
decoded in hexdigest."""
diff --git a/dedup/templates/index.html b/dedup/templates/index.html
index 7c9000f..169027e 100644
--- a/dedup/templates/index.html
+++ b/dedup/templates/index.html
@@ -28,7 +28,7 @@
{% block content %}
<h1>Debian duplication detector</h1>
<ul>
-<li>To inspect a particlar binary package, go to <pre>binary/&lt;packagename&gt;</pre> Example: <a href="binary/git">binary/git</a>
+<li>To inspect a particular binary package, go to <pre>binary/&lt;packagename&gt;</pre> Example: <a href="binary/git">binary/git</a>
<div style="display:none" id="form_div"><fieldset>
<legend>Inspect package</legend>
<noscript><b>This form is dysfunctional when javascript is not enabled</b></noscript>