summaryrefslogtreecommitdiff
path: root/dedup
diff options
context:
space:
mode:
Diffstat (limited to 'dedup')
-rw-r--r--dedup/hashing.py19
-rw-r--r--dedup/image.py67
-rw-r--r--dedup/static/favicon.icobin0 -> 4286 bytes
-rw-r--r--dedup/static/style.css12
-rw-r--r--dedup/templates/base.html22
-rw-r--r--dedup/templates/binary.html31
-rw-r--r--dedup/templates/compare.html27
-rw-r--r--dedup/templates/hash.html12
-rw-r--r--dedup/templates/index.html44
-rw-r--r--dedup/templates/source.html15
10 files changed, 227 insertions, 22 deletions
diff --git a/dedup/hashing.py b/dedup/hashing.py
index 1283c7e..002eda8 100644
--- a/dedup/hashing.py
+++ b/dedup/hashing.py
@@ -106,3 +106,22 @@ def hash_file(hashobj, filelike, blocksize=65536):
hashobj.update(data)
data = filelike.read(blocksize)
return hashobj
+
+class HashedStream(object):
+ """A file-like object that supports sequential reading and hashes the
+ contents on the fly."""
+ def __init__(self, filelike, hashobj):
+ """
+ @param filelike: a file-like object that must support the read method
+ @param hashobj: a hashlib-like object providing update and hexdigest
+ """
+ self.filelike = filelike
+ self.hashobj = hashobj
+
+ def read(self, length): # forwards length to the wrapped object's read method
+ data = self.filelike.read(length)
+ self.hashobj.update(data) # every chunk handed back to the caller is also fed to the hash
+ return data
+
+ def hexdigest(self): # delegates to the wrapped hash object
+ return self.hashobj.hexdigest()
diff --git a/dedup/image.py b/dedup/image.py
index 1148890..c1f2de0 100644
--- a/dedup/image.py
+++ b/dedup/image.py
@@ -4,9 +4,10 @@ import struct
import PIL.Image
class ImageHash(object):
- """A hash on the contents of an image. This disregards mode, depth and meta
- information. Note that due to limitations in PIL and the image format
- (interlacing) the full contents are stored and decoded in hexdigest."""
+ """A hash on the contents of an image data type supported by PIL. This
+ disregards mode, depth and meta information. Note that due to limitations
+ in PIL and the image format (interlacing) the full contents are stored and
+ decoded in hexdigest."""
maxsize = 1024 * 1024 * 32
# max memory usage is about 5 * maxpixels in bytes
maxpixels = 1024 * 1024 * 32
@@ -19,33 +20,25 @@ class ImageHash(object):
self.imagedetected = False
self.content = io.BytesIO()
+ def detect(self):
+ raise NotImplementedError
+
def update(self, data):
self.content.write(data)
if self.content.tell() > self.maxsize:
raise ValueError("maximum image size exceeded")
- if self.imagedetected:
- return
- if self.content.tell() < 33: # header + IHDR
- return
- curvalue = self.content.getvalue()
- if curvalue.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"):
- width, height = struct.unpack(">II", curvalue[16:24])
- if width * height > self.maxpixels:
- raise ValueError("maximum image pixels exceeded")
- self.imagedetected = True
- return
- raise ValueError("not a png image")
+ if not self.imagedetected:
+ self.imagedetected = self.detect()
def copy(self):
- new = ImageHash()
- new.hashobj = self.hashobj.copy()
+ new = self.__class__(self.hashobj.copy())
new.imagedetected = self.imagedetected
new.content = io.BytesIO(self.content.getvalue())
return new
def hexdigest(self):
if not self.imagedetected:
- raise ValueError("not a png image")
+ raise ValueError("not a image")
hashobj = self.hashobj.copy()
pos = self.content.tell()
try:
@@ -53,7 +46,7 @@ class ImageHash(object):
try:
img = PIL.Image.open(self.content)
except IOError:
- raise ValueError("broken png header")
+ raise ValueError("broken header")
width, height = img.size
pack = lambda elem: struct.pack("BBBB", *elem)
# special casing easy modes reduces memory usage
@@ -64,13 +57,43 @@ class ImageHash(object):
elif img.mode != "RGBA":
try:
img = img.convert("RGBA")
- except (SyntaxError, IndexError, IOError): # crazy stuff from PIL
- raise ValueError("error reading png image")
+ except (SyntaxError, IndexError, IOError):
+ # crazy stuff from PIL
+ raise ValueError("error reading image")
try:
for elem in img.getdata():
hashobj.update(pack(elem))
except (SyntaxError, IndexError, IOError): # crazy stuff from PIL
- raise ValueError("error reading png image")
+ raise ValueError("error reading image")
finally:
self.content.seek(pos)
return "%s%8.8x%8.8x" % (hashobj.hexdigest(), width, height)
+
+
+class PNGHash(ImageHash):
+ """A hash on the contents of a PNG image."""
+
+ def detect(self): # False: too little data so far; True: PNG header accepted; otherwise ValueError
+ if self.content.tell() < 33: # header + IHDR
+ return False
+ curvalue = self.content.getvalue()
+ if curvalue.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"): # PNG signature followed by a 13-byte IHDR chunk header
+ width, height = struct.unpack(">II", curvalue[16:24]) # two big-endian 32-bit integers at bytes 16..23
+ if width * height > self.maxpixels: # reject oversized images before any decoding is attempted
+ raise ValueError("maximum image pixels exceeded")
+ return True
+ raise ValueError("not a png image")
+
+class GIFHash(ImageHash):
+ """A hash on the contents of the first frame of a GIF image."""
+
+ def detect(self):
+ if self.content.tell() < 10: # magic + logical dimension
+ return False
+ curvalue = self.content.getvalue()
+ if curvalue.startswith((b"GIF87a", "GIF89a")):
+ width, height = struct.unpack("<HH", curvalue[6:10])
+ if width * height > self.maxpixels:
+ raise ValueError("maximum image pixels exceeded")
+ return True
+ raise ValueError("not a png image")
diff --git a/dedup/static/favicon.ico b/dedup/static/favicon.ico
new file mode 100644
index 0000000..5039835
--- /dev/null
+++ b/dedup/static/favicon.ico
Binary files differ
diff --git a/dedup/static/style.css b/dedup/static/style.css
new file mode 100644
index 0000000..531ef9d
--- /dev/null
+++ b/dedup/static/style.css
@@ -0,0 +1,12 @@
+.dependency {
+ background-color: yellow;
+}
+td {
+ vertical-align: top;
+}
+.filename {
+ display: block;
+}
+.filename:hover {
+ background-color: #eee;
+}
diff --git a/dedup/templates/base.html b/dedup/templates/base.html
new file mode 100644
index 0000000..62f4087
--- /dev/null
+++ b/dedup/templates/base.html
@@ -0,0 +1,22 @@
+<!DOCTYPE html>
+<html>
+ <head>
+ <title>{% block title %}{% endblock %}</title>
+ <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+ <link rel="stylesheet" type="text/css" href="{{ urlroot|e }}/style.css">
+ <link rel="icon" type="image/vnd.microsoft.icon" href="{{ urlroot|e }}/favicon.ico">
+ {% block header %}{% endblock %}
+ </head>
+ <body>
+{% block content %}
+{% endblock %}
+<hr>
+<h4>Details about this service</h4>
+<ul>
+ <li>More information: <a href="https://wiki.debian.org/dedup.debian.net">see wiki</a></li>
+ <li>Maintainer: Helmut Grohne &lt;helmut@subdivi.de&gt;</li>
+ <li>Source: git://murkel.subdivi.de/~helmut/debian-dedup.git</li>
+ <li>Bug reports / Feedback / Patches: to the maintainer</li>
+</ul>
+ </body>
+</html>
diff --git a/dedup/templates/binary.html b/dedup/templates/binary.html
new file mode 100644
index 0000000..69eceef
--- /dev/null
+++ b/dedup/templates/binary.html
@@ -0,0 +1,31 @@
+{% extends "base.html" %}
+{% block title %}duplication of {{ package|e }}{% endblock %}
+{% block content %}<h1>{{ package|e }}</h1>
+<p>Version: {{ version|e }}</p>
+<p>Architecture: {{ architecture|e }}</p>
+<p>Number of files: {{ num_files }}</p>
+<p>Total size: {{ total_size|filesizeformat }}</p>
+{%- if shared -%}
+ {%- for function, sharing in shared.items() -%}
+ <h3>sharing with respect to {{ function|e }}</h3>
+ <table border='1'><tr><th>package</th><th>files shared</th><th>data shared</th></tr>
+ {%- for entry in sharing|sort(attribute="savable", reverse=true) -%}
+ <tr><td{% if not entry.package or entry.package in dependencies %} class="dependency"{% endif %}>
+ {%- if entry.package %}<a href="{{ entry.package|e }}"><span class="binary-package">{{ entry.package|e }}</span></a>{% else %}self{% endif %}
+ <a href="../compare/{{ package|e }}/{{ entry.package|default(package, true)|e }}">compare</a></td>
+ <td>{{ entry.duplicate }} ({{ (100 * entry.duplicate / num_files)|int }}%)</td>
+ <td>{{ entry.savable|filesizeformat }} ({{ (100 * entry.savable / total_size)|int }}%)</td></tr>
+ {%- endfor -%}
+ </table>
+ {%- endfor -%}
+<p>Note: Packages with yellow background are required to be installed when this package is installed.</p>
+{%- endif -%}
+{%- if issues -%}
+ <h3>issues with particular files</h3>
+ <table border='1'><tr><th>filename</th><th>issue</th></tr>
+ {%- for filename, issue in issues|dictsort(true) -%}
+ <tr><td><span class="filename">{{ filename|e }}</span></td><td>{{ issue|e }}</td></tr>
+ {%- endfor -%}
+ </table>
+{%- endif -%}
+{% endblock %}
diff --git a/dedup/templates/compare.html b/dedup/templates/compare.html
new file mode 100644
index 0000000..f78e80f
--- /dev/null
+++ b/dedup/templates/compare.html
@@ -0,0 +1,27 @@
+{% extends "base.html" %}
+{% block title %}sharing between {{ details1.package|e }} and {{ details2.package|e }}{% endblock%}
+{% block content %}
+<h1><a href="../../binary/{{ details1.package|e }}">{{ details1.package|e }}</a> &lt;-&gt; <a href="../../binary/{{ details2.package|e }}">{{ details2.package|e }}</a></h1>
+<p>Version of {{ details1.package|e }}: {{ details1.version|e }}</p>
+<p>Architecture of {{ details1.package|e }}: {{ details1.architecture|e }}</p>
+{%- if details1.package != details2.package -%}
+<p>Version of {{ details2.package|e }}: {{ details2.version|e }}</p>
+<p>Architecture of {{ details2.package|e }}: {{ details2.architecture|e }}</p>
+{%- endif -%}
+<table border='1'><tr><th colspan="2">{{ details1.package|e }}</th><th colspan="2">{{ details2.package|e }}</th></tr>
+<tr><th>size</th><th>filename</th><th>hash functions</th><th>filename</th></tr>
+{%- for entry in shared -%}
+ <tr><td{% if entry.matches|length > 1 %} rowspan={{ entry.matches|length }}{% endif %}>{{ entry.size|filesizeformat }}</td><td{% if entry.matches|length > 1 %} rowspan={{ entry.matches|length }}{% endif %}>
+ {%- for filename in entry.filenames %}<span class="filename">{{ filename|e }}</span>{% endfor -%}</td><td>
+ {% for filename, match in entry.matches.items() -%}
+ {% if not loop.first %}<tr><td>{% endif -%}
+ {%- for funccomb, hashvalue in match.items() -%}
+ <a href="../../hash/{{ funccomb[0]|e }}/{{ hashvalue|e }}">{{ funccomb[0]|e }}</a>
+ {%- if funccomb[0] != funccomb[1] %} -&gt; <a href="../../hash/{{ funccomb[1]|e }}/{{ hashvalue|e }}">{{ funccomb[1]|e }}</a>{% endif %}
+ {%- if not loop.last %}, {% endif %}
+ {%- endfor -%}
+ </td><td><span class="filename">{{ filename|e }}</span></td></tr>
+ {%- endfor -%}
+{%- endfor -%}
+</table>
+{% endblock %}
diff --git a/dedup/templates/hash.html b/dedup/templates/hash.html
new file mode 100644
index 0000000..7141f96
--- /dev/null
+++ b/dedup/templates/hash.html
@@ -0,0 +1,12 @@
+{% extends "base.html" %}
+{% block title %}information on {{ function|e }} hash {{ hashvalue|e }}{% endblock %}
+{% block content %}
+<h1>{{ function|e }} {{ hashvalue|e }}</h1>
+<table border='1'><tr><th>package</th><th>filename</th><th>size</th><th>different function</th></tr>
+{%- for entry in entries -%}
+ <tr><td><a href="../../binary/{{ entry.package|e }}"><span class="binary-package">{{ entry.package|e }}</span></a></td>
+ <td><span class="filename">{{ entry.filename|e }}</span></td><td>{{ entry.size|filesizeformat }}</td>
+ <td>{% if function != entry.function %}{{ entry.function|e }}{% endif %}</td></tr>
+{%- endfor -%}
+</table>
+{% endblock %}
diff --git a/dedup/templates/index.html b/dedup/templates/index.html
new file mode 100644
index 0000000..7c9000f
--- /dev/null
+++ b/dedup/templates/index.html
@@ -0,0 +1,44 @@
+{% extends "base.html" %}
+{% block title %}Debian duplication detector{% endblock %}
+{% block header %}
+ <script type="text/javascript">
+ function getLinkTarget() { // link target for the package name currently typed into #pkg_name
+ var pkg = document.getElementById("pkg_name").value;
+ if(pkg) {
+ return "/binary/"+pkg; // NOTE(review): root-absolute and not URL-encoded, unlike the relative example links on this page - confirm this is intended
+ }
+ return '#'; // empty input: inert link
+ }
+ function processData() { // refresh the permanent link from the current input value
+ var link = document.getElementById("perma_link");
+ link.href = getLinkTarget();
+ link.text = location.href + getLinkTarget(); // NOTE(review): plain concatenation of the page URL and the target - verify the displayed text matches the href
+ }
+ window.onload = function() { // wire up the package form once the page has loaded
+ document.getElementById('pkg_name').onkeyup = processData; // keep the permanent link in sync while typing
+ document.getElementById("pkg_form").onsubmit = function () {
+ location.href = getLinkTarget(); // navigate from script instead of submitting
+ return false; // suppress the regular form submission
+ }
+ processData();
+ document.getElementById("form_div").style.display = ''; // form_div is marked display:none in the markup; reveal it only when scripts run
+ }
+ </script>
+{% endblock %}
+{% block content %}
+<h1>Debian duplication detector</h1>
+<ul>
+<li>To inspect a particular binary package, go to <pre>binary/&lt;packagename&gt;</pre> Example: <a href="binary/git">binary/git</a>
+ <div style="display:none" id="form_div"><fieldset>
+ <legend>Inspect package</legend>
+ <noscript><b>This form is dysfunctional when javascript is not enabled</b></noscript>
+ Enter binary package to inspect - Note: Non-existing packages will result in <b>404</b>-Errors
+ <form id="pkg_form">
+ <label for="pkg_name">Name: </label><input type="text" size="30" name="pkg_name" id="pkg_name">
+ <input type="submit" value="Go"> Permanent Link: <a id="perma_link" href="#"></a>
+ </form>
+ </fieldset></div></li>
+<li>To inspect a combination of binary packages go to <pre>compare/&lt;firstpackage&gt;/&lt;secondpackage&gt;</pre> Example: <a href="compare/git/git">compare/git/git</a></li>
+<li>To discover package shipping a particular file go to <pre>hash/sha512/&lt;hashvalue&gt;</pre> Example: <a href="hash/sha512/7633623b66b5e686bb94dd96a7cdb5a7e5ee00e87004fab416a5610d59c62badaf512a2e26e34e2455b7ed6b76690d2cd47464836d7d85d78b51d50f7e933d5c">hash/sha512/7633623b66b5e686bb94dd96a7cdb5a7e5ee00e87004fab416a5610d59c62badaf512a2e26e34e2455b7ed6b76690d2cd47464836d7d85d78b51d50f7e933d5c</a></li>
+</ul>
+{% endblock %}
diff --git a/dedup/templates/source.html b/dedup/templates/source.html
new file mode 100644
index 0000000..fc679b0
--- /dev/null
+++ b/dedup/templates/source.html
@@ -0,0 +1,15 @@
+{% extends "base.html" %}
+{% block title %}overview of {{ source|e }}{% endblock %}
+{% block content %}
+<h1>overview of {{ source|e }}</h1>
+<table border='1'><tr><th>binary from {{ source|e }}</th><th>savable</th><th>other package</th></tr>
+{%- for package, sharing in packages.items() -%}
+ <tr><td><a href="../binary/{{ package|e }}"><span class="binary-package">{{ package|e }}</span></a></td><td>
+ {%- if sharing -%}
+ {{ sharing.savable|filesizeformat }}</td><td><a href="../binary/{{ sharing.package|e }}"><span class="binary-package">{{ sharing.package|e }}</span></a> <a href="../compare/{{ package|e }}/{{ sharing.package|e }}">compare</a>
+ {%- else -%}</td><td>{%- endif -%}
+ </td></tr>
+{%- endfor -%}
+</table>
+<p>Note: Not all sharing listed here. Click on binary packages with non-zero savable to see more.</p>
+{% endblock %}