From 27b95909f061ae3ecb3ba1b8d46adfef98ca5e6f Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Sun, 16 Feb 2020 08:21:20 +0100 Subject: drop support for Python 2.x --- importpkg.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'importpkg.py') diff --git a/importpkg.py b/importpkg.py index ce4a446..4693401 100755 --- a/importpkg.py +++ b/importpkg.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 """This tool reads a Debian package from stdin and emits a yaml stream on stdout. It does not access a database. Therefore it can be run in parallel and on multiple machines. The generated yaml contains multiple documents. The first @@ -8,15 +8,12 @@ And finally a document consisting of the string "commit" is emitted.""" import argparse import hashlib import sys +import urllib.request import zlib -try: - from urllib.request import urlopen -except ImportError: - from urllib2 import urlopen import yaml -from dedup.debpkg import DebExtractor, decodetarname, get_tar_hashes +from dedup.debpkg import DebExtractor, get_tar_hashes from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \ HashBlacklistContent from dedup.compression import GzipDecompressor @@ -63,7 +60,7 @@ class ImportpkgExtractor(DebExtractor): # deb822 currently returns :any dependencies raw. 
see #670679 deprelations = info.relations.get("depends", []) + \ info.relations.get("pre-depends", []) - depends = set(dep[0]["name"].split(u':', 1)[0] + depends = set(dep[0]["name"].split(':', 1)[0] for dep in deprelations if len(dep) == 1) self.callback(dict(package=info["package"], source=source, version=info["version"], @@ -73,22 +70,19 @@ class ImportpkgExtractor(DebExtractor): for name, size, hashes in get_tar_hashes(tarfileobj, self.hash_functions): try: - name = decodetarname(name) - except UnicodeDecodeError: + name.encode("utf8", "strict") + except UnicodeEncodeError: print("warning: skipping filename with encoding error") continue # skip files with non-utf8 encoding for now self.callback(dict(name=name, size=size, hashes=hashes)) raise ProcessingFinished() def main(): - try: - stdin = sys.stdin.buffer - except AttributeError: # python2 - stdin = sys.stdin parser = argparse.ArgumentParser() parser.add_argument("-H", "--hash", action="store", help="verify that stdin hash given sha256 hash") - parser.add_argument("input", nargs='?', default=stdin, type=urlopen, + parser.add_argument("input", nargs='?', default=sys.stdin.buffer, + type=urllib.request.urlopen, help="read from this location instead of stdin") args = parser.parse_args() dumper = yaml.SafeDumper(sys.stdout) -- cgit v1.2.3 From c2b5909eff090ebb3f19ab88308f0cc7b913157e Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Wed, 29 Dec 2021 15:24:34 +0100 Subject: ImageHash: gain a name property Instead of retroactively attaching a name to an ImageHash, autogenerate it via a property. Doing so also simplifies static type checking. 
--- dedup/image.py | 6 ++++++ importpkg.py | 10 ++-------- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'importpkg.py') diff --git a/dedup/image.py b/dedup/image.py index 2e64e6b..91321f4 100644 --- a/dedup/image.py +++ b/dedup/image.py @@ -69,9 +69,14 @@ class ImageHash: self.content.seek(pos) return "%s%8.8x%8.8x" % (hashobj.hexdigest(), width, height) + @property + def name(self): + return self.name_prefix + self.hashobj.name + class PNGHash(ImageHash): """A hash on the contents of a PNG image.""" + name_prefix = "png_" def detect(self): if self.content.tell() < 33: # header + IHDR @@ -86,6 +91,7 @@ class PNGHash(ImageHash): class GIFHash(ImageHash): """A hash on the contents of the first frame of a GIF image.""" + name_prefix = "gif_" def detect(self): if self.content.tell() < 10: # magic + logical dimension diff --git a/importpkg.py b/importpkg.py index 4693401..6988c1d 100755 --- a/importpkg.py +++ b/importpkg.py @@ -31,16 +31,10 @@ def gziphash(): return HashBlacklistContent(hashobj, boring_content) def pnghash(): - hashobj = PNGHash(hashlib.sha512()) - hashobj = SuppressingHash(hashobj, (ValueError,)) - hashobj.name = "png_sha512" - return hashobj + return SuppressingHash(PNGHash(hashlib.sha512()), (ValueError,)) def gifhash(): - hashobj = GIFHash(hashlib.sha512()) - hashobj = SuppressingHash(hashobj, (ValueError,)) - hashobj.name = "gif_sha512" - return hashobj + return SuppressingHash(GIFHash(hashlib.sha512()), (ValueError,)) class ProcessingFinished(Exception): pass -- cgit v1.2.3 From 6b87bc371b91917980884d6dd20e39d3cda47fc7 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Wed, 29 Dec 2021 15:36:12 +0100 Subject: DecompressedHash: also gain a name property for consistency --- dedup/hashing.py | 7 +++++-- importpkg.py | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'importpkg.py') diff --git a/dedup/hashing.py b/dedup/hashing.py index 27f303c..9cebcbb 100644 --- a/dedup/hashing.py +++ b/dedup/hashing.py @@ -84,7 
+84,7 @@ class HashBlacklistContent: class DecompressedHash: """Apply a decompression function before the hash. This class provides the hashlib interface (update, hexdigest, copy) excluding digest and name.""" - def __init__(self, decompressor, hashobj): + def __init__(self, decompressor, hashobj, name="unnamed"): """ @param decompressor: a decompression object like bz2.BZ2Decompressor or lzma.LZMADecompressor. It has to provide methods decompress and @@ -92,9 +92,11 @@ class DecompressedHash: method. @param hashobj: a hashlib-like obj providing methods update, hexdigest and copy + @param name: initialized the name property """ self.decompressor = decompressor self.hashobj = hashobj + self.name = name def update(self, data): self.hashobj.update(self.decompressor.decompress(data)) @@ -113,7 +115,8 @@ class DecompressedHash: return tmphash.hexdigest() def copy(self): - return DecompressedHash(self.decompressor.copy(), self.hashobj.copy()) + return DecompressedHash(self.decompressor.copy(), self.hashobj.copy(), + self.name) class SuppressingHash: """A hash that silences exceptions from the update and hexdigest methods of diff --git a/importpkg.py b/importpkg.py index 6988c1d..6772c4d 100755 --- a/importpkg.py +++ b/importpkg.py @@ -25,9 +25,9 @@ def sha512_nontrivial(): return HashBlacklistContent(hashlib.sha512(), boring_content) def gziphash(): - hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512()) + hashobj = hashlib.sha512() + hashobj = DecompressedHash(GzipDecompressor(), hashobj, "gzip_sha512") hashobj = SuppressingHash(hashobj, (ValueError, zlib.error)) - hashobj.name = "gzip_sha512" return HashBlacklistContent(hashobj, boring_content) def pnghash(): -- cgit v1.2.3 From e3fa967ef54a0a7b107eebc1ceb83f66e770dc34 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Wed, 29 Dec 2021 15:55:28 +0100 Subject: importpkg: fix suppression of boring content The content must be bytes. Passing str silently skips the suppression. 
--- importpkg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'importpkg.py') diff --git a/importpkg.py b/importpkg.py index 6772c4d..7bca70b 100755 --- a/importpkg.py +++ b/importpkg.py @@ -19,7 +19,7 @@ from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \ from dedup.compression import GzipDecompressor from dedup.image import GIFHash, PNGHash -boring_content = set(("", "\n")) +boring_content = set((b"", b"\n")) def sha512_nontrivial(): return HashBlacklistContent(hashlib.sha512(), boring_content) -- cgit v1.2.3