diff options
Diffstat (limited to 'dedup')
-rw-r--r-- | dedup/hashing.py | 19 | ||||
-rw-r--r-- | dedup/image.py | 67 | ||||
-rw-r--r-- | dedup/static/favicon.ico | bin | 0 -> 4286 bytes | |||
-rw-r--r-- | dedup/static/style.css | 12 | ||||
-rw-r--r-- | dedup/templates/base.html | 22 | ||||
-rw-r--r-- | dedup/templates/binary.html | 31 | ||||
-rw-r--r-- | dedup/templates/compare.html | 27 | ||||
-rw-r--r-- | dedup/templates/hash.html | 12 | ||||
-rw-r--r-- | dedup/templates/index.html | 44 | ||||
-rw-r--r-- | dedup/templates/source.html | 15 |
10 files changed, 227 insertions, 22 deletions
diff --git a/dedup/hashing.py b/dedup/hashing.py index 1283c7e..002eda8 100644 --- a/dedup/hashing.py +++ b/dedup/hashing.py @@ -106,3 +106,22 @@ def hash_file(hashobj, filelike, blocksize=65536): hashobj.update(data) data = filelike.read(blocksize) return hashobj + +class HashedStream(object): + """A file-like object, that supports sequential reading and hashes the + contents on the fly.""" + def __init__(self, filelike, hashobj): + """ + @param filelike: a file-like object, that must support the read method + @param hashobj: a hashlib-like object providing update and hexdigest + """ + self.filelike = filelike + self.hashobj = hashobj + + def read(self, length): + data = self.filelike.read(length) + self.hashobj.update(data) + return data + + def hexdigest(self): + return self.hashobj.hexdigest() diff --git a/dedup/image.py b/dedup/image.py index 1148890..c1f2de0 100644 --- a/dedup/image.py +++ b/dedup/image.py @@ -4,9 +4,10 @@ import struct import PIL.Image class ImageHash(object): - """A hash on the contents of an image. This disregards mode, depth and meta - information. Note that due to limitations in PIL and the image format - (interlacing) the full contents are stored and decoded in hexdigest.""" + """A hash on the contents of an image data type supported by PIL. This + disregards mode, depth and meta information. 
Note that due to limitations + in PIL and the image format (interlacing) the full contents are stored and + decoded in hexdigest.""" maxsize = 1024 * 1024 * 32 # max memory usage is about 5 * maxpixels in bytes maxpixels = 1024 * 1024 * 32 @@ -19,33 +20,25 @@ class ImageHash(object): self.imagedetected = False self.content = io.BytesIO() + def detect(self): + raise NotImplementedError + def update(self, data): self.content.write(data) if self.content.tell() > self.maxsize: raise ValueError("maximum image size exceeded") - if self.imagedetected: - return - if self.content.tell() < 33: # header + IHDR - return - curvalue = self.content.getvalue() - if curvalue.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"): - width, height = struct.unpack(">II", curvalue[16:24]) - if width * height > self.maxpixels: - raise ValueError("maximum image pixels exceeded") - self.imagedetected = True - return - raise ValueError("not a png image") + if not self.imagedetected: + self.imagedetected = self.detect() def copy(self): - new = ImageHash() - new.hashobj = self.hashobj.copy() + new = self.__class__(self.hashobj.copy()) new.imagedetected = self.imagedetected new.content = io.BytesIO(self.content.getvalue()) return new def hexdigest(self): if not self.imagedetected: - raise ValueError("not a png image") + raise ValueError("not a image") hashobj = self.hashobj.copy() pos = self.content.tell() try: @@ -53,7 +46,7 @@ class ImageHash(object): try: img = PIL.Image.open(self.content) except IOError: - raise ValueError("broken png header") + raise ValueError("broken header") width, height = img.size pack = lambda elem: struct.pack("BBBB", *elem) # special casing easy modes reduces memory usage @@ -64,13 +57,43 @@ class ImageHash(object): elif img.mode != "RGBA": try: img = img.convert("RGBA") - except (SyntaxError, IndexError, IOError): # crazy stuff from PIL - raise ValueError("error reading png image") + except (SyntaxError, IndexError, IOError): + # crazy stuff from PIL + raise 
ValueError("error reading image") try: for elem in img.getdata(): hashobj.update(pack(elem)) except (SyntaxError, IndexError, IOError): # crazy stuff from PIL - raise ValueError("error reading png image") + raise ValueError("error reading image") finally: self.content.seek(pos) return "%s%8.8x%8.8x" % (hashobj.hexdigest(), width, height) + + +class PNGHash(ImageHash): + """A hash on the contents of a PNG image.""" + + def detect(self): + if self.content.tell() < 33: # header + IHDR + return False + curvalue = self.content.getvalue() + if curvalue.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"): + width, height = struct.unpack(">II", curvalue[16:24]) + if width * height > self.maxpixels: + raise ValueError("maximum image pixels exceeded") + return True + raise ValueError("not a png image") + +class GIFHash(ImageHash): + """A hash on the contents of the first frame of a GIF image.""" + + def detect(self): + if self.content.tell() < 10: # magic + logical dimension + return False + curvalue = self.content.getvalue() + if curvalue.startswith((b"GIF87a", b"GIF89a")): + width, height = struct.unpack("<HH", curvalue[6:10]) + if width * height > self.maxpixels: + raise ValueError("maximum image pixels exceeded") + return True + raise ValueError("not a gif image") diff --git a/dedup/static/favicon.ico b/dedup/static/favicon.ico Binary files differnew file mode 100644 index 0000000..5039835 --- /dev/null +++ b/dedup/static/favicon.ico diff --git a/dedup/static/style.css b/dedup/static/style.css new file mode 100644 index 0000000..531ef9d --- /dev/null +++ b/dedup/static/style.css @@ -0,0 +1,12 @@ +.dependency { + background-color: yellow; +} +td { + vertical-align: top; +} +.filename { + display: block; +} +.filename:hover { + background-color: #eee; +} diff --git a/dedup/templates/base.html b/dedup/templates/base.html new file mode 100644 index 0000000..62f4087 --- /dev/null +++ b/dedup/templates/base.html @@ -0,0 +1,22 @@ +<!DOCTYPE html> +<html> + <head> + <title>{% block 
title %}{% endblock %}</title> + <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"> + <link rel="stylesheet" type="text/css" href="{{ urlroot|e }}/style.css"> + <link rel="icon" type="image/vnd.microsoft.icon" href="{{ urlroot|e }}/favicon.ico"> + {% block header %}{% endblock %} + </head> + <body> +{% block content %} +{% endblock %} +<hr> +<h4>Details about this service</h4> +<ul> + <li>More information: <a href="https://wiki.debian.org/dedup.debian.net">see wiki</a></li> + <li>Maintainer: Helmut Grohne &lt;helmut@subdivi.de&gt;</li> + <li>Source: git://murkel.subdivi.de/~helmut/debian-dedup.git</li> + <li>Bug reports / Feedback / Patches: to the maintainer</li> +</ul> + </body> +</html> diff --git a/dedup/templates/binary.html b/dedup/templates/binary.html new file mode 100644 index 0000000..69eceef --- /dev/null +++ b/dedup/templates/binary.html @@ -0,0 +1,31 @@ +{% extends "base.html" %} +{% block title %}duplication of {{ package|e }}{% endblock %} +{% block content %}<h1>{{ package|e }}</h1> +<p>Version: {{ version|e }}</p> +<p>Architecture: {{ architecture|e }}</p> +<p>Number of files: {{ num_files }}</p> +<p>Total size: {{ total_size|filesizeformat }}</p> +{%- if shared -%} + {%- for function, sharing in shared.items() -%} + <h3>sharing with respect to {{ function|e }}</h3> + <table border='1'><tr><th>package</th><th>files shared</th><th>data shared</th></tr> + {%- for entry in sharing|sort(attribute="savable", reverse=true) -%} + <tr><td{% if not entry.package or entry.package in dependencies %} class="dependency"{% endif %}> + {%- if entry.package %}<a href="{{ entry.package|e }}"><span class="binary-package">{{ entry.package|e }}</span></a>{% else %}self{% endif %} + <a href="../compare/{{ package|e }}/{{ entry.package|default(package, true)|e }}">compare</a></td> + <td>{{ entry.duplicate }} ({{ (100 * entry.duplicate / num_files)|int }}%)</td> + <td>{{ entry.savable|filesizeformat }} ({{ (100 * entry.savable / total_size)|int }}%)</td></tr> 
+ {%- endfor -%} + </table> + {%- endfor -%} +<p>Note: Packages with yellow background are required to be installed when this package is installed.</p> +{%- endif -%} +{%- if issues -%} + <h3>issues with particular files</h3> + <table border='1'><tr><th>filename</th><th>issue</th></tr> + {%- for filename, issue in issues|dictsort(true) -%} + <tr><td><span class="filename">{{ filename|e }}</span></td><td>{{ issue|e }}</td></tr> + {%- endfor -%} + </table> +{%- endif -%} +{% endblock %} diff --git a/dedup/templates/compare.html b/dedup/templates/compare.html new file mode 100644 index 0000000..f78e80f --- /dev/null +++ b/dedup/templates/compare.html @@ -0,0 +1,27 @@ +{% extends "base.html" %} +{% block title %}sharing between {{ details1.package|e }} and {{ details2.package|e }}{% endblock%} +{% block content %} +<h1><a href="../../binary/{{ details1.package|e }}">{{ details1.package|e }}</a> <-> <a href="../../binary/{{ details2.package|e }}">{{ details2.package|e }}</a></h1> +<p>Version of {{ details1.package|e }}: {{ details1.version|e }}</p> +<p>Architecture of {{ details1.package|e }}: {{ details1.architecture|e }}</p> +{%- if details1.package != details2.package -%} +<p>Version of {{ details2.package|e }}: {{ details2.version|e }}</p> +<p>Architecture of {{ details2.package|e }}: {{ details2.architecture|e }}</p> +{%- endif -%} +<table border='1'><tr><th colspan="2">{{ details1.package|e }}</th><th colspan="2">{{ details2.package|e }}</th></tr> +<tr><th>size</th><th>filename</th><th>hash functions</th><th>filename</th></tr> +{%- for entry in shared -%} + <tr><td{% if entry.matches|length > 1 %} rowspan={{ entry.matches|length }}{% endif %}>{{ entry.size|filesizeformat }}</td><td{% if entry.matches|length > 1 %} rowspan={{ entry.matches|length }}{% endif %}> + {%- for filename in entry.filenames %}<span class="filename">{{ filename|e }}</span>{% endfor -%}</td><td> + {% for filename, match in entry.matches.items() -%} + {% if not loop.first %}<tr><td>{% endif 
-%} + {%- for funccomb, hashvalue in match.items() -%} + <a href="../../hash/{{ funccomb[0]|e }}/{{ hashvalue|e }}">{{ funccomb[0]|e }}</a> + {%- if funccomb[0] != funccomb[1] %} -> <a href="../../hash/{{ funccomb[1]|e }}/{{ hashvalue|e }}">{{ funccomb[1]|e }}</a>{% endif %} + {%- if not loop.last %}, {% endif %} + {%- endfor -%} + </td><td><span class="filename">{{ filename|e }}</span></td></tr> + {%- endfor -%} +{%- endfor -%} +</table> +{% endblock %} diff --git a/dedup/templates/hash.html b/dedup/templates/hash.html new file mode 100644 index 0000000..7141f96 --- /dev/null +++ b/dedup/templates/hash.html @@ -0,0 +1,12 @@ +{% extends "base.html" %} +{% block title %}information on {{ function|e }} hash {{ hashvalue|e }}{% endblock %} +{% block content %} +<h1>{{ function|e }} {{ hashvalue|e }}</h1> +<table border='1'><tr><th>package</th><th>filename</th><th>size</th><th>different function</th></tr> +{%- for entry in entries -%} + <tr><td><a href="../../binary/{{ entry.package|e }}"><span class="binary-package">{{ entry.package|e }}</span></a></td> + <td><span class="filename">{{ entry.filename|e }}</span></td><td>{{ entry.size|filesizeformat }}</td> + <td>{% if function != entry.function %}{{ entry.function|e }}{% endif %}</td></tr> +{%- endfor -%} +</table> +{% endblock %} diff --git a/dedup/templates/index.html b/dedup/templates/index.html new file mode 100644 index 0000000..7c9000f --- /dev/null +++ b/dedup/templates/index.html @@ -0,0 +1,44 @@ +{% extends "base.html" %} +{% block title %}Debian duplication detector{% endblock %} +{% block header %} + <script type="text/javascript"> + function getLinkTarget() { + var pkg = document.getElementById("pkg_name").value; + if(pkg) { + return "/binary/"+pkg; + } + return '#'; + } + function processData() { + var link = document.getElementById("perma_link"); + link.href = getLinkTarget(); + link.text = location.href + getLinkTarget(); + } + window.onload = function() { + document.getElementById('pkg_name').onkeyup = 
processData; + document.getElementById("pkg_form").onsubmit = function () { + location.href = getLinkTarget(); + return false; + } + processData(); + document.getElementById("form_div").style.display = ''; + } + </script> +{% endblock %} +{% block content %} +<h1>Debian duplication detector</h1> +<ul> +<li>To inspect a particular binary package, go to <pre>binary/&lt;packagename&gt;</pre> Example: <a href="binary/git">binary/git</a> + <div style="display:none" id="form_div"><fieldset> + <legend>Inspect package</legend> + <noscript><b>This form is dysfunctional when javascript is not enabled</b></noscript> + Enter binary package to inspect - Note: Non-existing packages will result in <b>404</b>-Errors + <form id="pkg_form"> + <label for="pkg_name">Name: </label><input type="text" size="30" name="pkg_name" id="pkg_name"> + <input type="submit" value="Go"> Permanent Link: <a id="perma_link" href="#"></a> + </form> + </fieldset></div></li> +<li>To inspect a combination of binary packages go to <pre>compare/&lt;firstpackage&gt;/&lt;secondpackage&gt;</pre> Example: <a href="compare/git/git">compare/git/git</a></li> +<li>To discover packages shipping a particular file go to <pre>hash/sha512/&lt;hashvalue&gt;</pre> Example: <a href="hash/sha512/7633623b66b5e686bb94dd96a7cdb5a7e5ee00e87004fab416a5610d59c62badaf512a2e26e34e2455b7ed6b76690d2cd47464836d7d85d78b51d50f7e933d5c">hash/sha512/7633623b66b5e686bb94dd96a7cdb5a7e5ee00e87004fab416a5610d59c62badaf512a2e26e34e2455b7ed6b76690d2cd47464836d7d85d78b51d50f7e933d5c</a></li> +</ul> +{% endblock %} diff --git a/dedup/templates/source.html b/dedup/templates/source.html new file mode 100644 index 0000000..fc679b0 --- /dev/null +++ b/dedup/templates/source.html @@ -0,0 +1,15 @@ +{% extends "base.html" %} +{% block title %}overview of {{ source|e }}{% endblock %} +{% block content %} +<h1>overview of {{ source|e }}</h1> +<table border='1'><tr><th>binary from {{ source|e }}</th><th>savable</th><th>other package</th></tr> +{%- for package, sharing in 
packages.items() -%} + <tr><td><a href="../binary/{{ package|e }}"><span class="binary-package">{{ package|e }}</span></a></td><td> + {%- if sharing -%} + {{ sharing.savable|filesizeformat }}</td><td><a href="../binary/{{ sharing.package|e }}"><span class="binary-package">{{ sharing.package|e }}</span></a> <a href="../compare/{{ package|e }}/{{ sharing.package|e }}">compare</a> + {%- else -%}</td><td>{%- endif -%} + </td></tr> +{%- endfor -%} +</table> +<p>Note: Not all sharing is listed here. Click on binary packages with non-zero savable to see more.</p> +{% endblock %} |