From 2f797c9b90f05eadf4bb13f4a9c1f029925d9275 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Tue, 23 Jul 2013 21:54:41 +0200 Subject: adapt queries in README to new schema --- README | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'README') diff --git a/README b/README index 44b086a..b0e06f3 100644 --- a/README +++ b/README @@ -43,12 +43,12 @@ Finding the 100 largest files shared with multiple packages. Finding those top 100 files that save most space when being reduced to only one copy in the archive. - SELECT hash, sum(size)-min(size), count(*), count(distinct pid) FROM content JOIN hash ON content.id = hash.cid WHERE hash.function = "sha512" GROUP BY hash ORDER BY sum(size)-min(size) DESC LIMIT 100; + SELECT hash, sum(size)-min(size), count(*), count(distinct pid) FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = "sha512" GROUP BY hash ORDER BY sum(size)-min(size) DESC LIMIT 100; Finding PNG images that do not carry a .png file extension. - SELECT package.name, content.filename, content.size FROM content JOIN hash ON content.id = hash.cid JOIN package ON content.pid = package.id WHERE function = "image_sha512" AND filename NOT LIKE "%.png"; + SELECT package.name, content.filename, content.size FROM content JOIN hash ON content.id = hash.cid JOIN package ON content.pid = package.id JOIN function ON hash.fid = function.id WHERE function.name = "image_sha512" AND filename NOT LIKE "%.png"; Finding .gz files which either are not gziped or contain errors. 
- SELECT package.name, content.filename FROM content JOIN package ON content.pid = package.id WHERE filename LIKE "%.gz" AND (SELECT count(*) FROM hash WHERE hash.cid = content.id AND hash.function = "gzip_sha512") = 0; + SELECT package.name, content.filename FROM content JOIN package ON content.pid = package.id WHERE filename LIKE "%.gz" AND (SELECT count(*) FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = "gzip_sha512") = 0; -- cgit v1.2.3 From 1e50900862fe8887755597d85483dbc845ccb5e3 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Tue, 23 Jul 2013 23:26:52 +0200 Subject: README: fix typo in query --- README | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'README') diff --git a/README b/README index 44b086a..a023d0a 100644 --- a/README +++ b/README @@ -38,7 +38,7 @@ SQL database by hand. Here are some example queries. Finding the 100 largest files shared with multiple packages. - SELECT pa.name, a.filename, pb.name, b.filename, a.size FROM content AS a JOIN hash AS ha ON a.id = ha.cid JOIN hash AS hb ON ha.hash = hb.hash JOIN content AS b ON b.id = hb.cid JOIN package AS pa ON b.pid = pa.id JOIN package AS pb ON b.pid = pb.id WHERE (a.pid != b.pid OR a.filename != b.filename) ORDER BY a.size DESC LIMIT 100; + SELECT pa.name, a.filename, pb.name, b.filename, a.size FROM content AS a JOIN hash AS ha ON a.id = ha.cid JOIN hash AS hb ON ha.hash = hb.hash JOIN content AS b ON b.id = hb.cid JOIN package AS pa ON a.pid = pa.id JOIN package AS pb ON b.pid = pb.id WHERE (a.pid != b.pid OR a.filename != b.filename) ORDER BY a.size DESC LIMIT 100; Finding those top 100 files that save most space when being reduced to only one copy in the archive. 
-- cgit v1.2.3 From 796eeb217e449234b777512451c5b668837c9118 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Thu, 25 Jul 2013 12:48:45 +0200 Subject: README: foo.PNG is also a valid png name --- README | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'README') diff --git a/README b/README index a023d0a..c749a86 100644 --- a/README +++ b/README @@ -47,7 +47,7 @@ one copy in the archive. Finding PNG images that do not carry a .png file extension. - SELECT package.name, content.filename, content.size FROM content JOIN hash ON content.id = hash.cid JOIN package ON content.pid = package.id WHERE function = "image_sha512" AND filename NOT LIKE "%.png"; + SELECT package.name, content.filename, content.size FROM content JOIN hash ON content.id = hash.cid JOIN package ON content.pid = package.id WHERE function = "image_sha512" AND lower(filename) NOT LIKE "%.png"; Finding .gz files which either are not gziped or contain errors. -- cgit v1.2.3 From 0c27c95a9c55b82b2c7e5e90b885c87578e895d0 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Sat, 27 Jul 2013 09:32:03 +0200 Subject: move templates to dedup package They cluttered webapp.py and now vim can give proper highlighting for the templates. 
--- README | 2 +- base.html | 22 ------- dedup/templates/base.html | 22 +++++++ dedup/templates/binary.html | 31 +++++++++ dedup/templates/compare.html | 27 ++++++++ dedup/templates/hash.html | 12 ++++ dedup/templates/index.html | 44 +++++++++++++ dedup/templates/source.html | 15 +++++ webapp.py | 146 ++----------------------------------------- 9 files changed, 158 insertions(+), 163 deletions(-) delete mode 100644 base.html create mode 100644 dedup/templates/base.html create mode 100644 dedup/templates/binary.html create mode 100644 dedup/templates/compare.html create mode 100644 dedup/templates/hash.html create mode 100644 dedup/templates/index.html create mode 100644 dedup/templates/source.html (limited to 'README') diff --git a/README b/README index ef0ae48..a84807a 100644 --- a/README +++ b/README @@ -1,7 +1,7 @@ Required packages ----------------- - aptitude install python python-debian python-lzma curl python-jinja2 python-werkzeug sqlite3 python-imaging python-yaml python-concurrent.futures + aptitude install python python-debian python-lzma curl python-jinja2 python-werkzeug sqlite3 python-imaging python-yaml python-concurrent.futures python-pkg-resources Create a database ----------------- diff --git a/base.html b/base.html deleted file mode 100644 index 4e49d47..0000000 --- a/base.html +++ /dev/null @@ -1,22 +0,0 @@ - - - - {% block title %}{% endblock %} - - - - {% block header %}{% endblock %} - - -{% block content %} -{% endblock %} -
-

Details about this service

-
    -
  • More information: see wiki
  • -
  • Maintainer: Helmut Grohne <helmut@subdivi.de>
  • -
  • Source: git://murkel.subdivi.de/~helmut/debian-dedup.git
  • -
  • Bugs reports / Feedback / Patches: to the maintainer
  • -
- - diff --git a/dedup/templates/base.html b/dedup/templates/base.html new file mode 100644 index 0000000..4e49d47 --- /dev/null +++ b/dedup/templates/base.html @@ -0,0 +1,22 @@ + + + + {% block title %}{% endblock %} + + + + {% block header %}{% endblock %} + + +{% block content %} +{% endblock %} +
+

Details about this service

+
    +
  • More information: see wiki
  • +
  • Maintainer: Helmut Grohne <helmut@subdivi.de>
  • +
  • Source: git://murkel.subdivi.de/~helmut/debian-dedup.git
  • +
  • Bugs reports / Feedback / Patches: to the maintainer
  • +
+ + diff --git a/dedup/templates/binary.html b/dedup/templates/binary.html new file mode 100644 index 0000000..59c910c --- /dev/null +++ b/dedup/templates/binary.html @@ -0,0 +1,31 @@ +{% extends "base.html" %} +{% block title %}duplication of {{ package|e }}{% endblock %} +{% block content %}

{{ package|e }}

+

Version: {{ version|e }}

+

Architecture: {{ architecture|e }}

+

Number of files: {{ num_files }}

+

Total size: {{ total_size|filesizeformat }}

+{%- if shared -%} + {%- for function, sharing in shared.items() -%} +

sharing with respect to {{ function|e }}

+ + {%- for entry in sharing|sort(attribute="savable", reverse=true) -%} + + {%- if entry.package %}{{ entry.package|e }}{% else %}self{% endif -%} + compare + + + {%- endfor -%} +
packagefiles shareddata shared
{{ entry.duplicate }} ({{ (100 * entry.duplicate / num_files)|int }}%){{ entry.savable|filesizeformat }} ({{ (100 * entry.savable / total_size)|int }}%)
+ {%- endfor -%} +

Note: Packages with yellow background are required to be installed when this package is installed.

+{%- endif -%} +{%- if issues -%} +

issues with particular files

+ + {%- for filename, issue in issues|dictsort(true) -%} + + {%- endfor -%} +
filenameissue
{{ filename|e }}{{ issue|e }}
+{%- endif -%} +{% endblock %} diff --git a/dedup/templates/compare.html b/dedup/templates/compare.html new file mode 100644 index 0000000..f78e80f --- /dev/null +++ b/dedup/templates/compare.html @@ -0,0 +1,27 @@ +{% extends "base.html" %} +{% block title %}sharing between {{ details1.package|e }} and {{ details2.package|e }}{% endblock%} +{% block content %} +

{{ details1.package|e }} <-> {{ details2.package|e }}

+

Version of {{ details1.package|e }}: {{ details1.version|e }}

+

Architecture of {{ details1.package|e }}: {{ details1.architecture|e }}

+{%- if details1.package != details2.package -%} +

Version of {{ details2.package|e }}: {{ details2.version|e }}

+

Architecture of {{ details2.package|e }}: {{ details2.architecture|e }}

+{%- endif -%} + + +{%- for entry in shared -%} + 1 %} rowspan={{ entry.matches|length }}{% endif %}>{{ entry.size|filesizeformat }} 1 %} rowspan={{ entry.matches|length }}{% endif %}> + {%- for filename in entry.filenames %}{{ filename|e }}{% endfor -%} + {%- endfor -%} +{%- endfor -%} +
{{ details1.package|e }}{{ details2.package|e }}
sizefilenamehash functionsfilename
+ {% for filename, match in entry.matches.items() -%} + {% if not loop.first %}
{% endif -%} + {%- for funccomb, hashvalue in match.items() -%} + {{ funccomb[0]|e }} + {%- if funccomb[0] != funccomb[1] %} -> {{ funccomb[1]|e }}{% endif %} + {%- if not loop.last %}, {% endif %} + {%- endfor -%} + {{ filename|e }}
+{% endblock %} diff --git a/dedup/templates/hash.html b/dedup/templates/hash.html new file mode 100644 index 0000000..7141f96 --- /dev/null +++ b/dedup/templates/hash.html @@ -0,0 +1,12 @@ +{% extends "base.html" %} +{% block title %}information on {{ function|e }} hash {{ hashvalue|e }}{% endblock %} +{% block content %} +

{{ function|e }} {{ hashvalue|e }}

+ +{%- for entry in entries -%} + + + +{%- endfor -%} +
packagefilenamesizedifferent function
{{ entry.package|e }}{{ entry.filename|e }}{{ entry.size|filesizeformat }}{% if function != entry.function %}{{ entry.function|e }}{% endif %}
+{% endblock %} diff --git a/dedup/templates/index.html b/dedup/templates/index.html new file mode 100644 index 0000000..7c9000f --- /dev/null +++ b/dedup/templates/index.html @@ -0,0 +1,44 @@ +{% extends "base.html" %} +{% block title %}Debian duplication detector{% endblock %} +{% block header %} + +{% endblock %} +{% block content %} +

Debian duplication detector

+ +{% endblock %} diff --git a/dedup/templates/source.html b/dedup/templates/source.html new file mode 100644 index 0000000..fc679b0 --- /dev/null +++ b/dedup/templates/source.html @@ -0,0 +1,15 @@ +{% extends "base.html" %} +{% block title %}overview of {{ source|e }}{% endblock %} +{% block content %} +

overview of {{ source|e }}

+ +{%- for package, sharing in packages.items() -%} + +{%- endfor -%} +
binary from {{ source|e }}savableother package
{{ package|e }} + {%- if sharing -%} + {{ sharing.savable|filesizeformat }}{{ sharing.package|e }} compare + {%- else -%}{%- endif -%} +
+

Note: Not all sharing listed here. Click on binary packages with non-zero savable to see more.

+{% endblock %} diff --git a/webapp.py b/webapp.py index c080d41..e180087 100755 --- a/webapp.py +++ b/webapp.py @@ -20,7 +20,7 @@ hash_functions = [ ("sha512", "gzip_sha512"), ("gzip_sha512", "sha512")] -jinjaenv = jinja2.Environment(loader=jinja2.FileSystemLoader(".")) +jinjaenv = jinja2.Environment(loader=jinja2.PackageLoader("dedup", "templates")) def format_size(size): size = float(size) @@ -45,145 +45,11 @@ def function_combination(function1, function2): jinjaenv.filters["filesizeformat"] = format_size base_template = jinjaenv.get_template("base.html") - -package_template = jinjaenv.from_string( -"""{% extends "base.html" %} -{% block title %}duplication of {{ package|e }}{% endblock %} -{% block content %}

{{ package|e }}

-

Version: {{ version|e }}

-

Architecture: {{ architecture|e }}

-

Number of files: {{ num_files }}

-

Total size: {{ total_size|filesizeformat }}

-{%- if shared -%} - {%- for function, sharing in shared.items() -%} -

sharing with respect to {{ function|e }}

- - {%- for entry in sharing|sort(attribute="savable", reverse=true) -%} - - {%- if entry.package %}{{ entry.package|e }}{% else %}self{% endif %} - compare - - - {%- endfor -%} -
packagefiles shareddata shared
{{ entry.duplicate }} ({{ (100 * entry.duplicate / num_files)|int }}%){{ entry.savable|filesizeformat }} ({{ (100 * entry.savable / total_size)|int }}%)
- {%- endfor -%} -

Note: Packages with yellow background are required to be installed when this package is installed.

-{%- endif -%} -{%- if issues -%} -

issues with particular files

- - {%- for filename, issue in issues|dictsort(true) -%} - - {%- endfor -%} -
filenameissue
{{ filename|e }}{{ issue|e }}
-{%- endif -%} -{% endblock %}""") - -detail_template = jinjaenv.from_string( -"""{% extends "base.html" %} -{% block title %}sharing between {{ details1.package|e }} and {{ details2.package|e }}{% endblock%} -{% block content %} -

{{ details1.package|e }} <-> {{ details2.package|e }}

-

Version of {{ details1.package|e }}: {{ details1.version|e }}

-

Architecture of {{ details1.package|e }}: {{ details1.architecture|e }}

-{%- if details1.package != details2.package -%} -

Version of {{ details2.package|e }}: {{ details2.version|e }}

-

Architecture of {{ details2.package|e }}: {{ details2.architecture|e }}

-{%- endif -%} - - -{%- for entry in shared -%} - 1 %} rowspan={{ entry.matches|length }}{% endif %}>{{ entry.size|filesizeformat }} 1 %} rowspan={{ entry.matches|length }}{% endif %}> - {%- for filename in entry.filenames %}{{ filename|e }}{% endfor -%} - {%- endfor -%} -{%- endfor -%} -
{{ details1.package|e }}{{ details2.package|e }}
sizefilenamehash functionsfilename
- {% for filename, match in entry.matches.items() -%} - {% if not loop.first %}
{% endif -%} - {%- for funccomb, hashvalue in match.items() -%} - {{ funccomb[0]|e }} - {%- if funccomb[0] != funccomb[1] %} -> {{ funccomb[1]|e }}{% endif %} - {%- if not loop.last %}, {% endif %} - {%- endfor -%} - {{ filename|e }}
-{% endblock %}""") - -hash_template = jinjaenv.from_string( -"""{% extends "base.html" %} -{% block title %}information on {{ function|e }} hash {{ hashvalue|e }}{% endblock %} -{% block content %} -

{{ function|e }} {{ hashvalue|e }}

- -{%- for entry in entries -%} - - - -{%- endfor -%} -
packagefilenamesizedifferent function
{{ entry.package|e }}{{ entry.filename|e }}{{ entry.size|filesizeformat }}{% if function != entry.function %}{{ entry.function|e }}{% endif %}
-{% endblock %}""") - -index_template = jinjaenv.from_string( -"""{% extends "base.html" %} -{% block title %}Debian duplication detector{% endblock %} -{% block header %} - -{% endblock %} -{% block content %} -

Debian duplication detector

- -{% endblock %}""") - -source_template = jinjaenv.from_string( -"""{% extends "base.html" %} -{% block title %}overview of {{ source|e }}{% endblock %} -{% block content %} -

overview of {{ source|e }}

- -{% for package, sharing in packages.items() %} - -{% endfor %} -
binary from {{ source|e }}savableother package
{{ package|e }} - {%- if sharing -%} - {{ sharing.savable|filesizeformat }}{{ sharing.package|e }} compare - {%- else -%}{%- endif -%} -
-

Note: Not all sharing listed here. Click on binary packages with non-zero savable to see more.

-{% endblock %}""") +package_template = jinjaenv.get_template("binary.html") +detail_template = jinjaenv.get_template("compare.html") +hash_template = jinjaenv.get_template("hash.html") +index_template = jinjaenv.get_template("index.html") +source_template = jinjaenv.get_template("source.html") def encode_and_buffer(iterator): buff = b"" -- cgit v1.2.3 From 2712edb550968ce7ec8cd9800241d7944666631a Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Thu, 1 Aug 2013 23:06:26 +0200 Subject: support hashing gif images * Rename "image_sha512" to "png_sha512". * dedup.image.ImageHash is now a base class for image hashes such as PNGHash and GIFHash. * Enable both hashes in importpkg. * Fix README. * Add new hash combinations to webapp. * Add "gif file not named *.gif" to issues in update_sharing. * Add redirect for "image_sha512" to webapp for backwards compatibility. --- README | 2 +- dedup/image.py | 67 +++++++++++++++++++++++++++++++++++++------------------ importpkg.py | 17 +++++++++----- schema.sql | 2 +- update_sharing.py | 3 ++- webapp.py | 10 ++++++++- 6 files changed, 70 insertions(+), 31 deletions(-) (limited to 'README') diff --git a/README b/README index a84807a..bf4da52 100644 --- a/README +++ b/README @@ -47,7 +47,7 @@ one copy in the archive. Finding PNG images that do not carry a .png file extension. - SELECT package.name, content.filename, content.size FROM content JOIN hash ON content.id = hash.cid JOIN package ON content.pid = package.id JOIN function ON hash.fid = function.id WHERE function.name = "image_sha512" AND lower(filename) NOT LIKE "%.png"; + SELECT package.name, content.filename, content.size FROM content JOIN hash ON content.id = hash.cid JOIN package ON content.pid = package.id JOIN function ON hash.fid = function.id WHERE function.name = "png_sha512" AND lower(filename) NOT LIKE "%.png"; Finding .gz files which either are not gziped or contain errors. 
diff --git a/dedup/image.py b/dedup/image.py index 1148890..c1f2de0 100644 --- a/dedup/image.py +++ b/dedup/image.py @@ -4,9 +4,10 @@ import struct import PIL.Image class ImageHash(object): - """A hash on the contents of an image. This disregards mode, depth and meta - information. Note that due to limitations in PIL and the image format - (interlacing) the full contents are stored and decoded in hexdigest.""" + """A hash on the contents of an image datat type supported by PIL. This + disregards mode, depth and meta information. Note that due to limitations + in PIL and the image format (interlacing) the full contents are stored and + decoded in hexdigest.""" maxsize = 1024 * 1024 * 32 # max memory usage is about 5 * maxpixels in bytes maxpixels = 1024 * 1024 * 32 @@ -19,33 +20,25 @@ class ImageHash(object): self.imagedetected = False self.content = io.BytesIO() + def detect(self): + raise NotImplementedError + def update(self, data): self.content.write(data) if self.content.tell() > self.maxsize: raise ValueError("maximum image size exceeded") - if self.imagedetected: - return - if self.content.tell() < 33: # header + IHDR - return - curvalue = self.content.getvalue() - if curvalue.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"): - width, height = struct.unpack(">II", curvalue[16:24]) - if width * height > self.maxpixels: - raise ValueError("maximum image pixels exceeded") - self.imagedetected = True - return - raise ValueError("not a png image") + if not self.imagedetected: + self.imagedetected = self.detect() def copy(self): - new = ImageHash() - new.hashobj = self.hashobj.copy() + new = self.__class__(self.hashobj.copy()) new.imagedetected = self.imagedetected new.content = io.BytesIO(self.content.getvalue()) return new def hexdigest(self): if not self.imagedetected: - raise ValueError("not a png image") + raise ValueError("not a image") hashobj = self.hashobj.copy() pos = self.content.tell() try: @@ -53,7 +46,7 @@ class ImageHash(object): try: img = 
PIL.Image.open(self.content) except IOError: - raise ValueError("broken png header") + raise ValueError("broken header") width, height = img.size pack = lambda elem: struct.pack("BBBB", *elem) # special casing easy modes reduces memory usage @@ -64,13 +57,43 @@ class ImageHash(object): elif img.mode != "RGBA": try: img = img.convert("RGBA") - except (SyntaxError, IndexError, IOError): # crazy stuff from PIL - raise ValueError("error reading png image") + except (SyntaxError, IndexError, IOError): + # crazy stuff from PIL + raise ValueError("error reading image") try: for elem in img.getdata(): hashobj.update(pack(elem)) except (SyntaxError, IndexError, IOError): # crazy stuff from PIL - raise ValueError("error reading png image") + raise ValueError("error reading image") finally: self.content.seek(pos) return "%s%8.8x%8.8x" % (hashobj.hexdigest(), width, height) + + +class PNGHash(ImageHash): + """A hash on the contents of a PNG image.""" + + def detect(self): + if self.content.tell() < 33: # header + IHDR + return False + curvalue = self.content.getvalue() + if curvalue.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"): + width, height = struct.unpack(">II", curvalue[16:24]) + if width * height > self.maxpixels: + raise ValueError("maximum image pixels exceeded") + return True + raise ValueError("not a png image") + +class GIFHash(ImageHash): + """A hash on the contents of the first frame of a GIF image.""" + + def detect(self): + if self.content.tell() < 10: # magic + logical dimension + return False + curvalue = self.content.getvalue() + if curvalue.startswith((b"GIF87a", "GIF89a")): + width, height = struct.unpack("<HH", curvalue[6:10]) + if width * height > self.maxpixels: + raise ValueError("maximum image pixels exceeded") + return True + raise ValueError("not a png image") diff --git a/importpkg.py b/importpkg.py index 02d4936..182ca01 100755 --- a/importpkg.py +++ b/importpkg.py @@ -19,7 +19,7 @@ from dedup.arreader import ArReader from dedup.hashing import HashBlacklist, DecompressedHash, 
SuppressingHash, \ HashedStream, hash_file from dedup.compression import GzipDecompressor, DecompressedStream -from dedup.image import ImageHash +from dedup.image import GIFHash, PNGHash class MultiHash(object): def __init__(self, *hashes): @@ -44,17 +44,24 @@ def gziphash(): hashobj.name = "gzip_sha512" return HashBlacklist(hashobj, boring_sha512_hashes) -def imagehash(): - hashobj = ImageHash(hashlib.sha512()) +def pnghash(): + hashobj = PNGHash(hashlib.sha512()) hashobj = SuppressingHash(hashobj, (ValueError,)) - hashobj.name = "image_sha512" + hashobj.name = "png_sha512" + return hashobj + +def gifhash(): + hashobj = GIFHash(hashlib.sha512()) + hashobj = SuppressingHash(hashobj, (ValueError,)) + hashobj.name = "gif_sha512" return hashobj def get_hashes(tar): for elem in tar: if not elem.isreg(): # excludes hard links as well continue - hasher = MultiHash(sha512_nontrivial(), gziphash(), imagehash()) + hasher = MultiHash(sha512_nontrivial(), gziphash(), pnghash(), + gifhash()) hasher = hash_file(hasher, tar.extractfile(elem)) hashes = {} for hashobj in hasher.hashes: diff --git a/schema.sql b/schema.sql index 13a65aa..ddc6ccd 100644 --- a/schema.sql +++ b/schema.sql @@ -1,7 +1,7 @@ CREATE TABLE package (id INTEGER PRIMARY KEY, name TEXT UNIQUE, version TEXT, architecture TEXT, source TEXT); CREATE TABLE content (id INTEGER PRIMARY KEY, pid INTEGER, filename TEXT, size INTEGER, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE); CREATE TABLE function (id INTEGER PRIMARY KEY, name TEXT UNIQUE NOT NULL); -INSERT INTO function (name) VALUES ("sha512"), ("gzip_sha512"), ("image_sha512"); +INSERT INTO function (name) VALUES ("sha512"), ("gzip_sha512"), ("png_sha512"), ("gif_sha512"); CREATE TABLE hash (cid INTEGER, fid INTEGER NOT NULL, hash TEXT, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE, FOREIGN KEY (fid) REFERENCES function(id)); CREATE TABLE dependency (pid INTEGER, required TEXT, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE 
CASCADE); CREATE INDEX content_package_size_index ON content (pid, size); diff --git a/update_sharing.py b/update_sharing.py index 910662e..5ec6c7b 100755 --- a/update_sharing.py +++ b/update_sharing.py @@ -56,7 +56,8 @@ def main(): [(row[1],) for row in rows]) process_pkgdict(cur, pkgdict) cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'file named something.gz is not a valid gzip file' FROM content WHERE content.filename LIKE '%.gz' AND NOT EXISTS (SELECT 1 FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = 'gzip_sha512');") - cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'image_sha512' AND lower(filename) NOT LIKE '%.png';") + cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'png_sha512' AND lower(filename) NOT LIKE '%.png';") + cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'gif image not named something.gif' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'gif_sha512' AND lower(filename) NOT LIKE '%.gif';") db.commit() if __name__ == "__main__": diff --git a/webapp.py b/webapp.py index 6c6f5b4..260268a 100755 --- a/webapp.py +++ b/webapp.py @@ -14,7 +14,10 @@ from dedup.utils import fetchiter hash_functions = [ ("sha512", "sha512"), - ("image_sha512", "image_sha512"), + ("png_sha512", "png_sha512"), + ("png_sha512", "gif_sha512"), + ("gif_sha512", "png_sha512"), + ("gif_sha512", "gif_sha512"), ("gzip_sha512", "gzip_sha512"), ("sha512", "gzip_sha512"), ("gzip_sha512", "sha512")] @@ -87,6 +90,11 @@ class Application(object): elif endpoint == "detail": return self.show_detail(args["package1"], 
args["package2"]) elif endpoint == "hash": + if args["function"] == "image_sha512": + # backwards compatibility + raise RequestRedirect("%s/hash/png_sha512/%s" % + (request.environ["SCRIPT_NAME"], + args["hashvalue"])) return self.show_hash(args["function"], args["hashvalue"]) elif endpoint == "index": if not request.environ["PATH_INFO"]: -- cgit v1.2.3