-rw-r--r--  README          |  4
-rwxr-xr-x  autoimport.py   | 12
-rw-r--r--  dedup/image.py  | 67
-rwxr-xr-x  importpkg.py    | 16
-rwxr-xr-x  webapp.py       | 45
5 files changed, 126 insertions, 18 deletions
diff --git a/README b/README
index 3f4af93..aff9868 100644
--- a/README
+++ b/README
@@ -1,7 +1,7 @@
Required packages
-----------------
-aptitude install python python-debian python-lzma curl python-jinja2 python-werkzeug sqlite3
+aptitude install python python-debian python-lzma curl python-jinja2 python-werkzeug sqlite3 python-imaging
Create a database
-----------------
@@ -12,7 +12,7 @@ Import packages
---------------
Import individual packages by feeding them to importpkg.py:
- ls -t /var/cache/apt/archives/*.deb | while read f; echo $f; ./importpkg.py < $f || break; done
+ ls -t /var/cache/apt/archives/*.deb | while read f; do echo $f; ./importpkg.py < $f || break; done
Import a full mirror::
diff --git a/autoimport.py b/autoimport.py
index 6ce7146..a7ea5ff 100755
--- a/autoimport.py
+++ b/autoimport.py
@@ -8,6 +8,7 @@ import sys
import urllib
from debian import deb822
+from debian.debian_support import version_compare
def main():
urlbase = sys.argv[1]
@@ -22,7 +23,8 @@ def main():
for pkg in deb822.Packages.iter_paragraphs(io.BytesIO(pkglist)):
name = pkg["Package"]
distpkgs.add(name)
- if pkg["Version"] == knownpkgs.get(name, ()):
+ if name in knownpkgs and \
+ version_compare(pkg["Version"], knownpkgs[name]) <= 0:
continue
pkgurl = "%s/%s" % (urlbase, pkg["Filename"])
print("importing %s" % name)
@@ -33,9 +35,15 @@ def main():
if dl.wait():
print("curl failed")
+ delpkgs = set(knownpkgs) - distpkgs
+ print("clearing packages %s" % " ".join(delpkgs))
cur.execute("PRAGMA foreign_keys=1;")
+ cur.executemany("DELETE FROM content WHERE package = ?;",
+ ((pkg,) for pkg in delpkgs))
+ cur.executemany("DELETE FROM dependency WHERE package = ?;",
+ ((pkg,) for pkg in delpkgs))
cur.executemany("DELETE FROM package WHERE package = ?;",
- ((pkg,) for pkg in set(knownpkgs) - distpkgs))
+ ((pkg,) for pkg in delpkgs))
db.commit()
if __name__ == "__main__":
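Note on the autoimport.py change above: switching from string equality to
version_compare means a package is only re-imported when the mirror carries a
strictly newer version than the one recorded in the database. A minimal sketch
of that check, assuming python-debian is available (needs_import is an
illustrative helper, not part of autoimport.py):

    from debian.debian_support import version_compare

    def needs_import(mirror_version, known_version=None):
        # Import when the package is unknown or the mirror version is
        # strictly newer according to Debian version ordering.
        if known_version is None:
            return True
        return version_compare(mirror_version, known_version) > 0

    assert version_compare("1.0-1", "1.0-2") < 0   # -2 is the newer revision
    assert version_compare("2.0~rc1", "2.0") < 0   # ~ sorts before the release
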
diff --git a/dedup/image.py b/dedup/image.py
new file mode 100644
index 0000000..e05e7da
--- /dev/null
+++ b/dedup/image.py
@@ -0,0 +1,67 @@
+import io
+import struct
+
+import PIL.Image
+
+class ImageHash(object):
+    """A hash on the contents of an image. This disregards mode, depth and meta
+    information. Note that due to limitations in PIL and the image format
+    (interlacing) the full contents are stored and decoded in hexdigest."""
+    maxsize = 1024 * 1024 * 32
+    # max memory usage is about 5 * maxpixels in bytes
+    maxpixels = 1024 * 1024 * 32
+
+    def __init__(self, hashobj):
+        """
+        @param hashobj: a hashlib-like object
+        """
+        self.hashobj = hashobj
+        self.imagedetected = False
+        self.content = io.BytesIO()
+
+    def update(self, data):
+        self.content.write(data)
+        if self.content.tell() > self.maxsize:
+            raise ValueError("maximum image size exceeded")
+        if self.imagedetected:
+            return
+        if self.content.tell() < 33: # header + IHDR
+            return
+        curvalue = self.content.getvalue()
+        if curvalue.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"):
+            width, height = struct.unpack(">II", curvalue[16:24])
+            if width * height > self.maxpixels:
+                raise ValueError("maximum image pixels exceeded")
+            self.imagedetected = True
+            return
+        raise ValueError("not a png image")
+
+    def copy(self):
+        # __init__ requires a hash object, so hand it a copy of ours.
+        new = ImageHash(self.hashobj.copy())
+        new.imagedetected = self.imagedetected
+        new.content = io.BytesIO(self.content.getvalue())
+        return new
+
+    def hexdigest(self):
+        if not self.imagedetected:
+            raise ValueError("not a png image")
+        hashobj = self.hashobj.copy()
+        pos = self.content.tell()
+        try:
+            self.content.seek(0)
+            img = PIL.Image.open(self.content)
+            width, height = img.size
+            pack = lambda elem: struct.pack("BBBB", *elem)
+            # special casing easy modes reduces memory usage
+            if img.mode == "L":
+                pack = lambda elem: struct.pack("BBBB", elem, elem, elem, 255)
+            elif img.mode == "RGB":
+                pack = lambda elem: struct.pack("BBBB", *(elem + (255,)))
+            elif img.mode != "RGBA":
+                img = img.convert("RGBA")
+            for elem in img.getdata():
+                hashobj.update(pack(elem))
+        finally:
+            self.content.seek(pos)
+        return "%s%8.8x%8.8x" % (hashobj.hexdigest(), width, height)
diff --git a/importpkg.py b/importpkg.py
index 89020b9..d626fba 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -20,6 +20,7 @@ import lzma
from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, hash_file
from dedup.compression import GzipDecompressor, DecompressedStream
+from dedup.image import ImageHash
class ArReader(object):
global_magic = b"!<arch>\n"
@@ -96,11 +97,17 @@ def gziphash():
hashobj.name = "gzip_sha512"
return HashBlacklist(hashobj, boring_sha512_hashes)
+def imagehash():
+    hashobj = ImageHash(hashlib.sha512())
+    hashobj = SuppressingHash(hashobj, (ValueError,))
+    hashobj.name = "image_sha512"
+    return hashobj
+
def get_hashes(tar):
for elem in tar:
if not elem.isreg(): # excludes hard links as well
continue
- hasher = MultiHash(sha512_nontrivial(), gziphash())
+ hasher = MultiHash(sha512_nontrivial(), gziphash(), imagehash())
hasher = hash_file(hasher, tar.extractfile(elem))
for hashobj in hasher.hashes:
hashvalue = hashobj.hexdigest()
@@ -167,8 +174,13 @@ def process_package(db, filelike):
if state != "control_file":
raise ValueError("missing control file")
for name, size, function, hexhash in get_hashes(tf):
+ try:
+ name = name.decode("utf8")
+ except UnicodeDecodeError:
+ print("warning: skipping filename with encoding error")
+ continue # skip files with non-utf8 encoding for now
cur.execute("INSERT INTO content (package, filename, size, function, hash) VALUES (?, ?, ?, ?, ?);",
- (package, name.decode("utf8"), size, function, hexhash))
+ (package, name, size, function, hexhash))
db.commit()
return
raise ValueError("data.tar not found")
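The filename decode in importpkg.py now happens before the INSERT, so a tar
member name that is not valid UTF-8 is skipped with a warning instead of
aborting the whole import. The guard in isolation (decode_filename is an
illustrative helper):

    def decode_filename(name):
        # Mirror of the guard above: return None for undecodable names so
        # the caller can skip them.
        try:
            return name.decode("utf8")
        except UnicodeDecodeError:
            print("warning: skipping filename with encoding error")
            return None

    assert decode_filename(b"usr/bin/gzip") == "usr/bin/gzip"
    assert decode_filename(b"\xff\xfe") is None
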
diff --git a/webapp.py b/webapp.py
index 06aa5d6..a215dd5 100755
--- a/webapp.py
+++ b/webapp.py
@@ -1,5 +1,6 @@
#!/usr/bin/python
+import datetime
import sqlite3
from wsgiref.simple_server import make_server
@@ -10,6 +11,7 @@ from werkzeug.wrappers import Request, Response
hash_functions = [
("sha512", "sha512"),
+ ("image_sha512", "image_sha512"),
("gzip_sha512", "gzip_sha512"),
("sha512", "gzip_sha512"),
("gzip_sha512", "sha512")]
@@ -104,6 +106,29 @@ index_template = jinjaenv.from_string(
</ul>
{% endblock %}""")
+def fetchiter(cursor):
+    rows = cursor.fetchmany()
+    while rows:
+        for row in rows:
+            yield row
+        rows = cursor.fetchmany()
+
+def encode_and_buffer(iterator):
+    buff = b""
+    for elem in iterator:
+        buff += elem.encode("utf8")
+        if len(buff) >= 2048:
+            yield buff
+            buff = b""
+    if buff:
+        yield buff
+
+def html_response(unicode_iterator, max_age=24 * 60 * 60):
+    resp = Response(encode_and_buffer(unicode_iterator), mimetype="text/html")
+    resp.cache_control.max_age = max_age
+    resp.expires = datetime.datetime.now() + datetime.timedelta(seconds=max_age)
+    return resp
+
class Application(object):
def __init__(self):
self.db = sqlite3.connect("test.sqlite3")
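The helpers above change how results leave the database and the templates:
fetchiter pulls rows in cursor.fetchmany() batches instead of materialising
the whole result set with fetchall(), and encode_and_buffer/html_response
stream the rendered page in chunks of at least 2 kB with cache headers set. A
standalone sketch of fetchiter (table and data are made up):

    import sqlite3

    def fetchiter(cursor):
        # same helper as added to webapp.py above
        rows = cursor.fetchmany()
        while rows:
            for row in rows:
                yield row
            rows = cursor.fetchmany()

    db = sqlite3.connect(":memory:")
    cur = db.cursor()
    cur.execute("CREATE TABLE content (package TEXT, size INTEGER);")
    cur.executemany("INSERT INTO content VALUES (?, ?);",
                    [("bash", 1), ("coreutils", 2)])
    cur.execute("SELECT package, size FROM content;")
    for package, size in fetchiter(cur):  # rows arrive batch by batch
        print(package, size)
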
@@ -129,8 +154,7 @@ class Application(object):
elif endpoint == "index":
if not request.environ["PATH_INFO"]:
raise RequestRedirect(request.environ["SCRIPT_NAME"] + "/")
- return Response(index_template.render().encode("utf8"),
- content_type="text/html")
+ return html_response(index_template.stream())
raise NotFound()
except HTTPException as e:
return e
@@ -154,7 +178,7 @@ class Application(object):
def get_dependencies(self, package):
self.cur.execute("SELECT required FROM dependency WHERE package = ?;",
(package,))
- return set(row[0] for row in self.cur.fetchall())
+ return set(row[0] for row in fetchiter(self.cur))
def show_package(self, package):
params = self.get_details(package)
@@ -165,7 +189,7 @@ class Application(object):
self.cur.execute("SELECT a.filename, a.hash, a.size, b.package FROM content AS a JOIN content AS b ON a.hash = b.hash WHERE a.package = ? AND a.function = ? AND b.function = ? AND (a.filename != b.filename OR b.package != ?);",
(package, func1, func2, package))
sharing = dict()
- for afile, hashval, size, bpkg in self.cur.fetchall():
+ for afile, hashval, size, bpkg in fetchiter(self.cur):
hashdict = sharing.setdefault(bpkg, dict())
fileset = hashdict.setdefault(hashval, (size, set()))[1]
fileset.add(afile)
@@ -186,8 +210,7 @@ class Application(object):
curstats.append(dict(package=pkg, duplicate=duplicate, savable=savable))
params["shared"] = sharedstats
- return Response(package_template.render(**params).encode("utf8"),
- content_type="text/html")
+ return html_response(package_template.render(params))
def show_detail(self, package1, package2):
if package1 == package2:
@@ -203,7 +226,7 @@ class Application(object):
(package1, package2))
shared = dict()
- for filename1, size1, func1, filename2, size2, func2, hashvalue in self.cur.fetchall():
+ for filename1, size1, func1, filename2, size2, func2, hashvalue in fetchiter(self.cur):
funccomb = (func1, func2)
if funccomb not in hash_functions:
continue
@@ -218,21 +241,19 @@ class Application(object):
details1=details1,
details2=details2,
shared=shared)
- return Response(detail_template.render(**params).encode("utf8"),
- content_type="text/html")
+ return html_response(detail_template.render(params))
def show_hash(self, function, hashvalue):
self.cur.execute("SELECT package, filename, size, function FROM content WHERE hash = ?;",
(hashvalue,))
entries = [dict(package=package, filename=filename, size=size,
function=otherfunc)
- for package, filename, size, otherfunc in self.cur.fetchall()
+ for package, filename, size, otherfunc in fetchiter(self.cur)
if (function, otherfunc) in hash_functions]
if not entries:
raise NotFound()
params = dict(function=function, hashvalue=hashvalue, entries=entries)
- return Response(hash_template.render(**params).encode("utf8"),
- content_type="text/html")
+ return html_response(hash_template.render(params))
def main():
app = Application()