summaryrefslogtreecommitdiff
path: root/dedup
diff options
context:
space:
mode:
authorHelmut Grohne <helmut@subdivi.de>2013-08-02 08:40:49 +0200
committerHelmut Grohne <helmut@subdivi.de>2013-08-02 08:40:49 +0200
commitcb3708825bf7ea32314040575cef35980dad0cd8 (patch)
tree31575a8525dc90ba6904268d94f47e1604bf0557 /dedup
parenta4bbbb6e664e605634cb3f9e0564c7e4a93697be (diff)
parent2712edb550968ce7ec8cd9800241d7944666631a (diff)
downloaddebian-dedup-cb3708825bf7ea32314040575cef35980dad0cd8.tar.gz
Merge branch master into sqlalchemy
This makes the sqlalchemy branch schema-compatible with master again. The biggest change on master was the introduction of the function table. It caused most of the conflicts. Note that webapp had one conflict not detected by git: The selecting of issues in show_package needed sqlalchemy conversion. Conflicts: README update_sharing.py webapp.py
Diffstat (limited to 'dedup')
-rw-r--r--dedup/hashing.py19
-rw-r--r--dedup/image.py67
-rw-r--r--dedup/static/favicon.icobin0 -> 4286 bytes
-rw-r--r--dedup/static/style.css12
-rw-r--r--dedup/templates/base.html22
-rw-r--r--dedup/templates/binary.html31
-rw-r--r--dedup/templates/compare.html27
-rw-r--r--dedup/templates/hash.html12
-rw-r--r--dedup/templates/index.html44
-rw-r--r--dedup/templates/source.html15
10 files changed, 227 insertions, 22 deletions
diff --git a/dedup/hashing.py b/dedup/hashing.py
index 1283c7e..002eda8 100644
--- a/dedup/hashing.py
+++ b/dedup/hashing.py
@@ -106,3 +106,22 @@ def hash_file(hashobj, filelike, blocksize=65536):
hashobj.update(data)
data = filelike.read(blocksize)
return hashobj
+
+class HashedStream(object):
+    """A file-like object, that supports sequential reading and hashes the
+    contents on the fly."""
+
+    def __init__(self, filelike, hashobj):
+        """
+        @param filelike: a file-like object, that must support the read method
+        @param hashobj: a hashlib-like object providing update and hexdigest
+        """
+        self.filelike = filelike
+        self.hashobj = hashobj
+
+    def read(self, length=-1):
+        """Read from the wrapped object and feed the result to the hash.
+
+        @param length: size argument handed to filelike.read unchanged; the
+            default of -1 follows the usual file convention of reading
+            everything that is left
+        @returns: whatever filelike.read returned
+        """
+        data = self.filelike.read(length)
+        # Hash exactly what is handed back to the caller.
+        self.hashobj.update(data)
+        return data
+
+    def hexdigest(self):
+        """
+        @returns: the hexdigest of hashobj, covering all data read so far
+        """
+        return self.hashobj.hexdigest()
diff --git a/dedup/image.py b/dedup/image.py
index 1148890..c1f2de0 100644
--- a/dedup/image.py
+++ b/dedup/image.py
@@ -4,9 +4,10 @@ import struct
import PIL.Image
class ImageHash(object):
- """A hash on the contents of an image. This disregards mode, depth and meta
- information. Note that due to limitations in PIL and the image format
- (interlacing) the full contents are stored and decoded in hexdigest."""
+ """A hash on the contents of an image data type supported by PIL. This
+ disregards mode, depth and meta information. Note that due to limitations
+ in PIL and the image format (interlacing) the full contents are stored and
+ decoded in hexdigest."""
maxsize = 1024 * 1024 * 32
# max memory usage is about 5 * maxpixels in bytes
maxpixels = 1024 * 1024 * 32
@@ -19,33 +20,25 @@ class ImageHash(object):
self.imagedetected = False
self.content = io.BytesIO()
+ def detect(self):
+ raise NotImplementedError
+
def update(self, data):
self.content.write(data)
if self.content.tell() > self.maxsize:
raise ValueError("maximum image size exceeded")
- if self.imagedetected:
- return
- if self.content.tell() < 33: # header + IHDR
- return
- curvalue = self.content.getvalue()
- if curvalue.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"):
- width, height = struct.unpack(">II", curvalue[16:24])
- if width * height > self.maxpixels:
- raise ValueError("maximum image pixels exceeded")
- self.imagedetected = True
- return
- raise ValueError("not a png image")
+ if not self.imagedetected:
+ self.imagedetected = self.detect()
def copy(self):
- new = ImageHash()
- new.hashobj = self.hashobj.copy()
+ new = self.__class__(self.hashobj.copy())
new.imagedetected = self.imagedetected
new.content = io.BytesIO(self.content.getvalue())
return new
def hexdigest(self):
if not self.imagedetected:
- raise ValueError("not a png image")
+ raise ValueError("not a image")
hashobj = self.hashobj.copy()
pos = self.content.tell()
try:
@@ -53,7 +46,7 @@ class ImageHash(object):
try:
img = PIL.Image.open(self.content)
except IOError:
- raise ValueError("broken png header")
+ raise ValueError("broken header")
width, height = img.size
pack = lambda elem: struct.pack("BBBB", *elem)
# special casing easy modes reduces memory usage
@@ -64,13 +57,43 @@ class ImageHash(object):
elif img.mode != "RGBA":
try:
img = img.convert("RGBA")
- except (SyntaxError, IndexError, IOError): # crazy stuff from PIL
- raise ValueError("error reading png image")
+ except (SyntaxError, IndexError, IOError):
+ # crazy stuff from PIL
+ raise ValueError("error reading image")
try:
for elem in img.getdata():
hashobj.update(pack(elem))
except (SyntaxError, IndexError, IOError): # crazy stuff from PIL
- raise ValueError("error reading png image")
+ raise ValueError("error reading image")
finally:
self.content.seek(pos)
return "%s%8.8x%8.8x" % (hashobj.hexdigest(), width, height)
+
+
+class PNGHash(ImageHash):
+    """A hash on the contents of a PNG image."""
+
+    def detect(self):
+        """Inspect the data collected so far for a PNG signature and IHDR.
+
+        @returns: False while the position of the content buffer is below
+            33, True once a PNG header within the pixel limit was found
+        @raises ValueError: if the data does not start like a PNG image or
+            width * height exceeds maxpixels
+        """
+        if self.content.tell() < 33:  # header + IHDR
+            return False
+        header = self.content.getvalue()
+        if not header.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"):
+            raise ValueError("not a png image")
+        # Width and height are read as two big-endian 32 bit integers.
+        width, height = struct.unpack(">II", header[16:24])
+        if width * height > self.maxpixels:
+            raise ValueError("maximum image pixels exceeded")
+        return True
+
+class GIFHash(ImageHash):
+    """A hash on the contents of the first frame of a GIF image."""
+
+    def detect(self):
+        """Inspect the data collected so far for a GIF magic and dimensions.
+
+        @returns: False while the position of the content buffer is below
+            10, True once a GIF header within the pixel limit was found
+        @raises ValueError: if the data does not start with a GIF magic or
+            width * height exceeds maxpixels
+        """
+        if self.content.tell() < 10:  # magic + logical dimension
+            return False
+        curvalue = self.content.getvalue()
+        # Every prefix has to be a bytes literal: on Python 3 a str element
+        # in the tuple makes bytes.startswith raise TypeError.
+        if curvalue.startswith((b"GIF87a", b"GIF89a")):
+            # Width and height are read as two little-endian 16 bit integers.
+            width, height = struct.unpack("<HH", curvalue[6:10])
+            if width * height > self.maxpixels:
+                raise ValueError("maximum image pixels exceeded")
+            return True
+        raise ValueError("not a gif image")
diff --git a/dedup/static/favicon.ico b/dedup/static/favicon.ico
new file mode 100644
index 0000000..5039835
--- /dev/null
+++ b/dedup/static/favicon.ico
Binary files differ
diff --git a/dedup/static/style.css b/dedup/static/style.css
new file mode 100644
index 0000000..531ef9d
--- /dev/null
+++ b/dedup/static/style.css
@@ -0,0 +1,12 @@
+/* Set by binary.html on cells for the package itself or one of its
+   dependencies; the page explains the yellow background in a note. */
+.dependency {
+ background-color: yellow;
+}
+/* Align all table cells to the top of their row. */
+td {
+ vertical-align: top;
+}
+/* Render each file name as a block of its own. */
+.filename {
+ display: block;
+}
+/* Highlight the file name under the pointer. */
+.filename:hover {
+ background-color: #eee;
+}
diff --git a/dedup/templates/base.html b/dedup/templates/base.html
new file mode 100644
index 0000000..62f4087
--- /dev/null
+++ b/dedup/templates/base.html
@@ -0,0 +1,22 @@
+<!DOCTYPE html>
+<html>
+ <head>
+ <title>{% block title %}{% endblock %}</title>
+ <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+ <link rel="stylesheet" type="text/css" href="{{ urlroot|e }}/style.css">
+ <link rel="icon" type="image/vnd.microsoft.icon" href="{{ urlroot|e }}/favicon.ico">
+ {% block header %}{% endblock %}
+ </head>
+ <body>
+{% block content %}
+{% endblock %}
+<hr>
+<h4>Details about this service</h4>
+<ul>
+ <li>More information: <a href="https://wiki.debian.org/dedup.debian.net">see wiki</a></li>
+ <li>Maintainer: Helmut Grohne &lt;helmut@subdivi.de&gt;</li>
+ <li>Source: git://murkel.subdivi.de/~helmut/debian-dedup.git</li>
+ <li>Bug reports / Feedback / Patches: to the maintainer</li>
+</ul>
+ </body>
+</html>
diff --git a/dedup/templates/binary.html b/dedup/templates/binary.html
new file mode 100644
index 0000000..69eceef
--- /dev/null
+++ b/dedup/templates/binary.html
@@ -0,0 +1,31 @@
+{% extends "base.html" %}
+{% block title %}duplication of {{ package|e }}{% endblock %}
+{% block content %}<h1>{{ package|e }}</h1>
+<p>Version: {{ version|e }}</p>
+<p>Architecture: {{ architecture|e }}</p>
+<p>Number of files: {{ num_files }}</p>
+<p>Total size: {{ total_size|filesizeformat }}</p>
+{%- if shared -%}
+ {%- for function, sharing in shared.items() -%}
+ <h3>sharing with respect to {{ function|e }}</h3>
+ <table border='1'><tr><th>package</th><th>files shared</th><th>data shared</th></tr>
+ {%- for entry in sharing|sort(attribute="savable", reverse=true) -%}
+ <tr><td{% if not entry.package or entry.package in dependencies %} class="dependency"{% endif %}>
+ {%- if entry.package %}<a href="{{ entry.package|e }}"><span class="binary-package">{{ entry.package|e }}</span></a>{% else %}self{% endif %}
+ <a href="../compare/{{ package|e }}/{{ entry.package|default(package, true)|e }}">compare</a></td>
+ <td>{{ entry.duplicate }} ({{ (100 * entry.duplicate / num_files)|int }}%)</td>
+ <td>{{ entry.savable|filesizeformat }} ({{ (100 * entry.savable / total_size)|int }}%)</td></tr>
+ {%- endfor -%}
+ </table>
+ {%- endfor -%}
+<p>Note: Packages with yellow background are required to be installed when this package is installed.</p>
+{%- endif -%}
+{%- if issues -%}
+ <h3>issues with particular files</h3>
+ <table border='1'><tr><th>filename</th><th>issue</th></tr>
+ {%- for filename, issue in issues|dictsort(true) -%}
+ <tr><td><span class="filename">{{ filename|e }}</span></td><td>{{ issue|e }}</td></tr>
+ {%- endfor -%}
+ </table>
+{%- endif -%}
+{% endblock %}
diff --git a/dedup/templates/compare.html b/dedup/templates/compare.html
new file mode 100644
index 0000000..f78e80f
--- /dev/null
+++ b/dedup/templates/compare.html
@@ -0,0 +1,27 @@
+{% extends "base.html" %}
+{% block title %}sharing between {{ details1.package|e }} and {{ details2.package|e }}{% endblock%}
+{% block content %}
+<h1><a href="../../binary/{{ details1.package|e }}">{{ details1.package|e }}</a> &lt;-&gt; <a href="../../binary/{{ details2.package|e }}">{{ details2.package|e }}</a></h1>
+<p>Version of {{ details1.package|e }}: {{ details1.version|e }}</p>
+<p>Architecture of {{ details1.package|e }}: {{ details1.architecture|e }}</p>
+{%- if details1.package != details2.package -%}
+<p>Version of {{ details2.package|e }}: {{ details2.version|e }}</p>
+<p>Architecture of {{ details2.package|e }}: {{ details2.architecture|e }}</p>
+{%- endif -%}
+<table border='1'><tr><th colspan="2">{{ details1.package|e }}</th><th colspan="2">{{ details2.package|e }}</th></tr>
+<tr><th>size</th><th>filename</th><th>hash functions</th><th>filename</th></tr>
+{%- for entry in shared -%}
+ <tr><td{% if entry.matches|length > 1 %} rowspan={{ entry.matches|length }}{% endif %}>{{ entry.size|filesizeformat }}</td><td{% if entry.matches|length > 1 %} rowspan={{ entry.matches|length }}{% endif %}>
+ {%- for filename in entry.filenames %}<span class="filename">{{ filename|e }}</span>{% endfor -%}</td><td>
+ {% for filename, match in entry.matches.items() -%}
+ {% if not loop.first %}<tr><td>{% endif -%}
+ {%- for funccomb, hashvalue in match.items() -%}
+ <a href="../../hash/{{ funccomb[0]|e }}/{{ hashvalue|e }}">{{ funccomb[0]|e }}</a>
+ {%- if funccomb[0] != funccomb[1] %} -&gt; <a href="../../hash/{{ funccomb[1]|e }}/{{ hashvalue|e }}">{{ funccomb[1]|e }}</a>{% endif %}
+ {%- if not loop.last %}, {% endif %}
+ {%- endfor -%}
+ </td><td><span class="filename">{{ filename|e }}</span></td></tr>
+ {%- endfor -%}
+{%- endfor -%}
+</table>
+{% endblock %}
diff --git a/dedup/templates/hash.html b/dedup/templates/hash.html
new file mode 100644
index 0000000..7141f96
--- /dev/null
+++ b/dedup/templates/hash.html
@@ -0,0 +1,12 @@
+{% extends "base.html" %}
+{% block title %}information on {{ function|e }} hash {{ hashvalue|e }}{% endblock %}
+{% block content %}
+<h1>{{ function|e }} {{ hashvalue|e }}</h1>
+<table border='1'><tr><th>package</th><th>filename</th><th>size</th><th>different function</th></tr>
+{%- for entry in entries -%}
+ <tr><td><a href="../../binary/{{ entry.package|e }}"><span class="binary-package">{{ entry.package|e }}</span></a></td>
+ <td><span class="filename">{{ entry.filename|e }}</span></td><td>{{ entry.size|filesizeformat }}</td>
+ <td>{% if function != entry.function %}{{ entry.function|e }}{% endif %}</td></tr>
+{%- endfor -%}
+</table>
+{% endblock %}
diff --git a/dedup/templates/index.html b/dedup/templates/index.html
new file mode 100644
index 0000000..7c9000f
--- /dev/null
+++ b/dedup/templates/index.html
@@ -0,0 +1,44 @@
+{% extends "base.html" %}
+{% block title %}Debian duplication detector{% endblock %}
+{% block header %}
+ <script type="text/javascript">
+    function getLinkTarget() {
+      // Returns the address of the binary package page for the name typed
+      // into the form, or '#' while the field is empty.
+      var pkg = document.getElementById("pkg_name").value;
+      if(pkg) {
+        // The name is free-form user input: escape it so that characters
+        // like '#', '?' or '/' stay inside the path segment.
+        return "/binary/"+encodeURIComponent(pkg);
+      }
+      return '#';
+    }
+    function processData() {
+      // Keeps the "Permanent Link" anchor in sync with the input field.
+      var link = document.getElementById("perma_link");
+      link.href = getLinkTarget();
+      // Reading href back yields the absolute address the browser resolved.
+      // Gluing location.href and the target together could show a text
+      // that differs from where the link really points.
+      link.text = link.href;
+    }
+    window.onload = function() {
+      document.getElementById('pkg_name').onkeyup = processData;
+      document.getElementById("pkg_form").onsubmit = function () {
+        // Navigate to the package page instead of submitting the form.
+        location.href = getLinkTarget();
+        return false;
+      }
+      processData();
+      // form_div carries display:none in the markup; clearing the inline
+      // style reveals the form only when scripting is available.
+      document.getElementById("form_div").style.display = '';
+    }
+ </script>
+{% endblock %}
+{% block content %}
+<h1>Debian duplication detector</h1>
+<ul>
+<li>To inspect a particular binary package, go to <pre>binary/&lt;packagename&gt;</pre> Example: <a href="binary/git">binary/git</a>
+ <div style="display:none" id="form_div"><fieldset>
+ <legend>Inspect package</legend>
+ <noscript><b>This form is dysfunctional when javascript is not enabled</b></noscript>
+ Enter binary package to inspect - Note: Non-existing packages will result in <b>404</b>-Errors
+ <form id="pkg_form">
+ <label for="pkg_name">Name: </label><input type="text" size="30" name="pkg_name" id="pkg_name">
+ <input type="submit" value="Go"> Permanent Link: <a id="perma_link" href="#"></a>
+ </form>
+ </fieldset></div></li>
+<li>To inspect a combination of binary packages go to <pre>compare/&lt;firstpackage&gt;/&lt;secondpackage&gt;</pre> Example: <a href="compare/git/git">compare/git/git</a></li>
+<li>To discover packages shipping a particular file go to <pre>hash/sha512/&lt;hashvalue&gt;</pre> Example: <a href="hash/sha512/7633623b66b5e686bb94dd96a7cdb5a7e5ee00e87004fab416a5610d59c62badaf512a2e26e34e2455b7ed6b76690d2cd47464836d7d85d78b51d50f7e933d5c">hash/sha512/7633623b66b5e686bb94dd96a7cdb5a7e5ee00e87004fab416a5610d59c62badaf512a2e26e34e2455b7ed6b76690d2cd47464836d7d85d78b51d50f7e933d5c</a></li>
+</ul>
+{% endblock %}
diff --git a/dedup/templates/source.html b/dedup/templates/source.html
new file mode 100644
index 0000000..fc679b0
--- /dev/null
+++ b/dedup/templates/source.html
@@ -0,0 +1,15 @@
+{% extends "base.html" %}
+{% block title %}overview of {{ source|e }}{% endblock %}
+{% block content %}
+<h1>overview of {{ source|e }}</h1>
+<table border='1'><tr><th>binary from {{ source|e }}</th><th>savable</th><th>other package</th></tr>
+{%- for package, sharing in packages.items() -%}
+ <tr><td><a href="../binary/{{ package|e }}"><span class="binary-package">{{ package|e }}</span></a></td><td>
+ {%- if sharing -%}
+ {{ sharing.savable|filesizeformat }}</td><td><a href="../binary/{{ sharing.package|e }}"><span class="binary-package">{{ sharing.package|e }}</span></a> <a href="../compare/{{ package|e }}/{{ sharing.package|e }}">compare</a>
+ {%- else -%}</td><td>{%- endif -%}
+ </td></tr>
+{%- endfor -%}
+</table>
+<p>Note: Not all sharing is listed here. Click on binary packages with non-zero savable to see more.</p>
+{% endblock %}