From 1056d8cd9601aa8c1328e5055439294bd89a50f1 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Fri, 29 Jul 2016 17:04:12 +0200
Subject: repository moved

---
 dedup/templates/base.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dedup/templates/base.html b/dedup/templates/base.html
index 9dfb788..bac516e 100644
--- a/dedup/templates/base.html
+++ b/dedup/templates/base.html
@@ -15,7 +15,7 @@
-- cgit v1.2.3

From 5c8e852859b59b6b6bc0d058a5ed89fad55a2f29 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Sun, 13 Nov 2016 08:44:58 +0100
Subject: autoimport: fix regression in url computation

The list path got inadvertently prepended to all binary package urls.

Fixes: 420804c25797 ("autoimport: improve fetching package lists")
---
 autoimport.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/autoimport.py b/autoimport.py
index c00dc4d..e51d052 100755
--- a/autoimport.py
+++ b/autoimport.py
@@ -26,8 +26,8 @@ from dedup.utils import open_compressed_mirror_url
 from readyaml import readyaml
 
 def process_http(pkgs, url, addhash=True):
-    url += "/dists/sid/main/binary-amd64/Packages"
-    with contextlib.closing(open_compressed_mirror_url(url)) as pkglist:
+    listurl = url + "/dists/sid/main/binary-amd64/Packages"
+    with contextlib.closing(open_compressed_mirror_url(listurl)) as pkglist:
         for pkg in deb822.Packages.iter_paragraphs(pkglist):
             name = pkg["Package"]
             if name in pkgs and \
-- cgit v1.2.3

From a9647ababd30925dc6c15f330a9158d94556cae5 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Wed, 13 Sep 2017 09:04:24 +0200
Subject: fix HashBlacklistContent.copy

It wasn't copying the stored member and thus could blacklist "wrong"
content after a copy.
---
 dedup/hashing.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/dedup/hashing.py b/dedup/hashing.py
index 2a83929..c91fb64 100644
--- a/dedup/hashing.py
+++ b/dedup/hashing.py
@@ -80,8 +80,10 @@ class HashBlacklistContent(object):
         return self.hashobj.hexdigest()
 
     def copy(self):
-        return HashBlacklistContent(self.hashobj.copy(), self.blacklist,
-                                    self.maxlen)
+        new = HashBlacklistContent(self.hashobj.copy(), self.blacklist,
+                                   self.maxlen)
+        new.stored = self.stored
+        return new
 
 class DecompressedHash(object):
     """Apply a decompression function before the hash. This class provides the
-- cgit v1.2.3

From 5df1185e5fa0830b546b4ef6af3cdadc655c16c8 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Sat, 23 Sep 2017 10:33:43 +0200
Subject: add module dedup.filemagic

This module is not used anywhere and thus its dependency on python3-magic
is not recorded in the README. It can be used to guess the file type by
looking at the contents using file magic. It is not a typical hash
function, but it can be used for repurposing dedup for other analysers.
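
As an aside, a minimal usage sketch for the FileDigester class introduced
below; the feed loop mirrors dedup's hash_file() helper and the
identify_file wrapper is hypothetical:

    from dedup.filemagic import FileDigester

    def identify_file(path):
        # Hypothetical helper: feed the file in blocks, as hash_file() does.
        digester = FileDigester()
        with open(path, "rb") as f:
            data = f.read(65536)
            while data:
                digester.update(data)  # buffers until FILE_BYTES_MAX is seen
                data = f.read(65536)
        # hexdigest() is an alias of identify() for hashlib compatibility.
        return digester.hexdigest()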
--- dedup/filemagic.py | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 dedup/filemagic.py diff --git a/dedup/filemagic.py b/dedup/filemagic.py new file mode 100644 index 0000000..4cc9357 --- /dev/null +++ b/dedup/filemagic.py @@ -0,0 +1,43 @@ +"""A very strange "hash" that uses the magic module (python3-magic) to guess +the file type.""" + +import magic + +class FileDigester(object): + """A hashlib-like class to guess a filetype using the magic module.""" + FILE_BYTES_MAX = 1024 * 1024 # copied from file source + + def __init__(self): + self.buff = b"" + self.identification = None + + def _compute_identification(self): + try: + return magic.none_magic.buffer(self.buff) + except UnicodeDecodeError: + return "magic identification is not valid UTF-8" + + def update(self, buff): + if self.identification: + return + self.buff += buff + if len(self.buff) >= self.FILE_BYTES_MAX: + self.identification = self._compute_identification() + self.buff = None + + def identify(self): + """Return the guessed file magic identification.""" + if self.identification: + return self.identification + return self._compute_identification() + + def hexdigest(self): + """Compatibility with hashlib. An alias of identify. Doesn't return + hex.""" + return self.identify() + + def copy(self): + new = FileDigester() + new.buff = self.buff + new.identification = self.identification + return new -- cgit v1.2.3 From e77a1ebf8bda10494088bb6c72873d8ef214e0f3 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Mon, 25 Jun 2018 21:07:41 +0200 Subject: adapt to python3-magic/2:0.4.15-1 API --- dedup/filemagic.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/dedup/filemagic.py b/dedup/filemagic.py index 4cc9357..c5a6357 100644 --- a/dedup/filemagic.py +++ b/dedup/filemagic.py @@ -3,6 +3,12 @@ the file type.""" import magic +# It changed API a few times... 
+try: + _magic_identify = magic.from_buffer +except AttributeError: + _magic_identify = magic.none_magic.buffer + class FileDigester(object): """A hashlib-like class to guess a filetype using the magic module.""" FILE_BYTES_MAX = 1024 * 1024 # copied from file source @@ -13,7 +19,7 @@ class FileDigester(object): def _compute_identification(self): try: - return magic.none_magic.buffer(self.buff) + return _magic_identify(self.buff) except UnicodeDecodeError: return "magic identification is not valid UTF-8" -- cgit v1.2.3 From 27b95909f061ae3ecb3ba1b8d46adfef98ca5e6f Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Sun, 16 Feb 2020 08:21:20 +0100 Subject: drop support for Python 2.x --- README | 2 +- autoimport.py | 10 +++------- dedup/arreader.py | 2 +- dedup/compression.py | 17 +++++++---------- dedup/debpkg.py | 32 +++++--------------------------- dedup/filemagic.py | 2 +- dedup/hashing.py | 14 +++++--------- dedup/image.py | 2 +- dedup/utils.py | 21 +++++++-------------- importpkg.py | 22 ++++++++-------------- readyaml.py | 2 +- update_sharing.py | 2 +- webapp.py | 4 ++-- 13 files changed, 43 insertions(+), 89 deletions(-) diff --git a/README b/README index 5329bd8..d0a488c 100644 --- a/README +++ b/README @@ -1,7 +1,7 @@ Required packages ----------------- - aptitude install python python-debian python-lzma python-jinja2 python-werkzeug sqlite3 python-imaging python-yaml python-concurrent.futures python-pkg-resources + aptitude install python3 python3-debian python3-lzma python3-jinja2 python3-werkzeug sqlite3 python3-imaging python3-yaml python3-concurrent.futures python3-pkg-resources Create a database ----------------- diff --git a/autoimport.py b/autoimport.py index e51d052..ec47db0 100755 --- a/autoimport.py +++ b/autoimport.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 """This scrip takes a directory or a http base url to a mirror and imports all packages contained. It has rather strong assumptions on the working directory. """ @@ -12,11 +12,7 @@ import sqlite3 import subprocess import sys import tempfile -try: - from urllib.parse import unquote -except ImportError: - from urllib import unquote - +import urllib.parse import concurrent.futures from debian import deb822 from debian.debian_support import version_compare @@ -47,7 +43,7 @@ def process_file(pkgs, filename): if len(parts) != 3: raise ValueError("filename not in form name_version_arch.deb") name, version, _ = parts - version = unquote(version) + version = urllib.parse.unquote(version) if name in pkgs and version_compare(pkgs[name]["version"], version) > 0: return pkgs[name] = dict(version=version, filename=filename) diff --git a/dedup/arreader.py b/dedup/arreader.py index e53efd9..8b14ff9 100644 --- a/dedup/arreader.py +++ b/dedup/arreader.py @@ -1,6 +1,6 @@ import struct -class ArReader(object): +class ArReader: """Streaming AR file reader. After constructing an object, you usually call read_magic once. 
Then you call read_entry in a loop and use the
     ArReader object as file-like only providing read() to read the respective
diff --git a/dedup/compression.py b/dedup/compression.py
index 8d1912b..161eda2 100644
--- a/dedup/compression.py
+++ b/dedup/compression.py
@@ -1,13 +1,10 @@
 import bz2
 import struct
-import sys
 import zlib
 
 import lzma
 
-crc32_type = "L" if sys.version_info.major >= 3 else "l"
-
-class GzipDecompressor(object):
+class GzipDecompressor:
     """An interface to gzip which is similar to bz2.BZ2Decompressor and
     lzma.LZMADecompressor."""
     def __init__(self):
@@ -66,7 +63,7 @@ class GzipDecompressor(object):
         elif not self.sawheader:
             return self.inbuffer
         else:
-            expect = struct.pack("<" + crc32_type + "L", self.crc, self.size)
+            expect = struct.pack("<LL", self.crc, self.size)
diff --git a/dedup/debpkg.py b/dedup/debpkg.py
--- a/dedup/debpkg.py
+++ b/dedup/debpkg.py
-if sys.version_info.major >= 3:
-    def opentar(filelike):
-        return tarfile.open(fileobj=filelike, mode="r|", encoding="utf8",
-                            errors="surrogateescape")
+def opentar(filelike):
+    return tarfile.open(fileobj=filelike, mode="r|", encoding="utf8",
+                        errors="surrogateescape")
 
-    def decodetarname(name):
-        """Decoded name of a tarinfo.
-        @raises UnicodeDecodeError:
-        """
-        try:
-            name.encode("utf8", "strict")
-        except UnicodeEncodeError as e:
-            if e.reason == "surrogates not allowed":
-                name.encode("utf8", "surrogateescape").decode("utf8", "strict")
-        return name
-else:
-    def opentar(filelike):
-        return tarfile.open(fileobj=filelike, mode="r|")
-
-    def decodetarname(name):
-        """Decoded name of a tarinfo.
-        @raises UnicodeDecodeError:
-        """
-        return name.decode("utf8")
-
-class DebExtractor(object):
+class DebExtractor:
     "Base class for extracting desired features from a Debian package."
 
     def __init__(self):
diff --git a/dedup/filemagic.py b/dedup/filemagic.py
index c5a6357..b71c276 100644
--- a/dedup/filemagic.py
+++ b/dedup/filemagic.py
@@ -9,7 +9,7 @@ try:
     _magic_identify = magic.from_buffer
 except AttributeError:
     _magic_identify = magic.none_magic.buffer
 
-class FileDigester(object):
+class FileDigester:
     """A hashlib-like class to guess a filetype using the magic module."""
     FILE_BYTES_MAX = 1024 * 1024 # copied from file source
diff --git a/dedup/hashing.py b/dedup/hashing.py
index c91fb64..21f14ea 100644
--- a/dedup/hashing.py
+++ b/dedup/hashing.py
@@ -1,10 +1,6 @@
 import itertools
-try:
-    from itertools import imap as map
-except ImportError:
-    pass # in python3 map is already imap
 
-class HashBlacklist(object):
+class HashBlacklist:
     """Turn a hashlib-like object into a hash that returns None for some
     blacklisted hashes instead of the real hash value.
 
@@ -35,7 +31,7 @@ def copy(self):
         return HashBlacklist(self.hashobj.copy(), self.blacklist)
 
-class HashBlacklistContent(object):
+class HashBlacklistContent:
     """Turn a hashlib-like object into a hash that returns None for some
     blacklisted content instead of the real hash value. Unlike HashBlacklist,
     not the output of the hash is considered, but its input."""
@@ -85,7 +81,7 @@
         new.stored = self.stored
         return new
 
-class DecompressedHash(object):
+class DecompressedHash:
     """Apply a decompression function before the hash. This class provides the
     hashlib interface (update, hexdigest, copy) excluding digest and name."""
     def __init__(self, decompressor, hashobj):
@@ -119,7 +115,7 @@ def copy(self):
         return DecompressedHash(self.decompressor.copy(), self.hashobj.copy())
 
-class SuppressingHash(object):
+class SuppressingHash:
     """A hash that silences exceptions from the update and hexdigest methods of
     a hashlib-like object.
If an exception has occurred, hexdigest always returns None.""" @@ -167,7 +163,7 @@ def hash_file(hashobj, filelike, blocksize=65536): data = filelike.read(blocksize) return hashobj -class HashedStream(object): +class HashedStream: """A file-like object, that supports sequential reading and hashes the contents on the fly.""" def __init__(self, filelike, hashobj): diff --git a/dedup/image.py b/dedup/image.py index 314eb44..2e64e6b 100644 --- a/dedup/image.py +++ b/dedup/image.py @@ -3,7 +3,7 @@ import struct import PIL.Image -class ImageHash(object): +class ImageHash: """A hash on the contents of an image data type supported by PIL. This disregards mode, depth and meta information. Note that due to limitations in PIL and the image format (interlacing) the full contents are stored and diff --git a/dedup/utils.py b/dedup/utils.py index dab6653..46f8e64 100644 --- a/dedup/utils.py +++ b/dedup/utils.py @@ -1,12 +1,6 @@ import errno -try: - from urllib.error import URLError, HTTPError -except ImportError: - from urllib2 import URLError, HTTPError -try: - from urllib.request import urlopen -except ImportError: - from urllib2 import urlopen +import urllib.error +import urllib.request from debian.debian_support import version_compare @@ -15,15 +9,14 @@ from dedup.compression import decompress def fetchiter(cursor): rows = cursor.fetchmany() while rows: - for row in rows: - yield row + yield from rows rows = cursor.fetchmany() def sql_add_version_compare(db): db.create_collation("debian_version", version_compare) db.create_function("debian_version_compare", 2, version_compare) -def open_compressed_mirror_url(url, extensions=(u".xz", u".gz", u"")): +def open_compressed_mirror_url(url, extensions=(".xz", ".gz", "")): """Fetch the given url. Try appending each of the given compression schemes and move on in case it doesn't exist. Decompress the resulting stream on the fly. @@ -31,11 +24,11 @@ def open_compressed_mirror_url(url, extensions=(u".xz", u".gz", u"")): """ for ext in extensions: try: - handle = urlopen(url + ext) - except HTTPError as error: + handle = urllib.request.urlopen(url + ext) + except urllib.error.HTTPError as error: if error.code != 404: raise - except URLError as error: + except urllib.error.URLError as error: if not hasattr(error.reason, "errno"): raise if error.reason.errno != errno.ENOENT: diff --git a/importpkg.py b/importpkg.py index ce4a446..4693401 100755 --- a/importpkg.py +++ b/importpkg.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 """This tool reads a Debian package from stdin and emits a yaml stream on stdout. It does not access a database. Therefore it can be run in parallel and on multiple machines. The generated yaml contains multiple documents. The first @@ -8,15 +8,12 @@ And finally a document consisting of the string "commit" is emitted.""" import argparse import hashlib import sys +import urllib.request import zlib -try: - from urllib.request import urlopen -except ImportError: - from urllib2 import urlopen import yaml -from dedup.debpkg import DebExtractor, decodetarname, get_tar_hashes +from dedup.debpkg import DebExtractor, get_tar_hashes from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \ HashBlacklistContent from dedup.compression import GzipDecompressor @@ -63,7 +60,7 @@ class ImportpkgExtractor(DebExtractor): # deb822 currently returns :any dependencies raw. 
see #670679 deprelations = info.relations.get("depends", []) + \ info.relations.get("pre-depends", []) - depends = set(dep[0]["name"].split(u':', 1)[0] + depends = set(dep[0]["name"].split(':', 1)[0] for dep in deprelations if len(dep) == 1) self.callback(dict(package=info["package"], source=source, version=info["version"], @@ -73,22 +70,19 @@ class ImportpkgExtractor(DebExtractor): for name, size, hashes in get_tar_hashes(tarfileobj, self.hash_functions): try: - name = decodetarname(name) - except UnicodeDecodeError: + name.encode("utf8", "strict") + except UnicodeEncodeError: print("warning: skipping filename with encoding error") continue # skip files with non-utf8 encoding for now self.callback(dict(name=name, size=size, hashes=hashes)) raise ProcessingFinished() def main(): - try: - stdin = sys.stdin.buffer - except AttributeError: # python2 - stdin = sys.stdin parser = argparse.ArgumentParser() parser.add_argument("-H", "--hash", action="store", help="verify that stdin hash given sha256 hash") - parser.add_argument("input", nargs='?', default=stdin, type=urlopen, + parser.add_argument("input", nargs='?', default=sys.stdin.buffer, + type=urllib.request.urlopen, help="read from this location instead of stdin") args = parser.parse_args() dumper = yaml.SafeDumper(sys.stdout) diff --git a/readyaml.py b/readyaml.py index b6f7316..a4837cf 100755 --- a/readyaml.py +++ b/readyaml.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 """This tool reads a yaml file as generated by importpkg.py on stdin and updates the database with the contents.""" diff --git a/update_sharing.py b/update_sharing.py index ac6c945..78e6171 100755 --- a/update_sharing.py +++ b/update_sharing.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import argparse import sqlite3 diff --git a/webapp.py b/webapp.py index f9e667e..69e9df8 100755 --- a/webapp.py +++ b/webapp.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import argparse import contextlib @@ -68,7 +68,7 @@ class InternalRedirect(Exception): self.target = target self.code = code -class Application(object): +class Application: def __init__(self, db): self.db = db self.routingmap = Map([ -- cgit v1.2.3 From b4251e2cc3453852d93ad6a2c0c116991982c2f9 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Sun, 25 Oct 2020 10:01:37 +0100 Subject: use python3-pil instead of removed python3-imaging --- README | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README b/README index d0a488c..4572c8a 100644 --- a/README +++ b/README @@ -1,7 +1,7 @@ Required packages ----------------- - aptitude install python3 python3-debian python3-lzma python3-jinja2 python3-werkzeug sqlite3 python3-imaging python3-yaml python3-concurrent.futures python3-pkg-resources + aptitude install python3 python3-debian python3-lzma python3-jinja2 python3-werkzeug sqlite3 python3-pil python3-yaml python3-concurrent.futures python3-pkg-resources Create a database ----------------- -- cgit v1.2.3 From 4542d84439bbc6bd8f3151a9cb61d0ee85cd910e Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Sun, 25 Oct 2020 10:20:34 +0100 Subject: externalize ar parsing to arpy --- README | 2 +- dedup/arreader.py | 79 ------------------------------------------------------- dedup/debpkg.py | 27 +++++++------------ 3 files changed, 10 insertions(+), 98 deletions(-) delete mode 100644 dedup/arreader.py diff --git a/README b/README index 4572c8a..ed4e8cb 100644 --- a/README +++ b/README @@ -1,7 +1,7 @@ Required packages ----------------- - aptitude install python3 python3-debian 
python3-lzma python3-jinja2 python3-werkzeug sqlite3 python3-pil python3-yaml python3-concurrent.futures python3-pkg-resources + aptitude install python3 python3-arpy python3-debian python3-lzma python3-jinja2 python3-werkzeug sqlite3 python3-pil python3-yaml python3-concurrent.futures python3-pkg-resources Create a database ----------------- diff --git a/dedup/arreader.py b/dedup/arreader.py deleted file mode 100644 index 8b14ff9..0000000 --- a/dedup/arreader.py +++ /dev/null @@ -1,79 +0,0 @@ -import struct - -class ArReader: - """Streaming AR file reader. After constructing an object, you usually - call read_magic once. Then you call read_entry in a loop and use the - ArReader object as file-like only providing read() to read the respective - file contents until you get EOFError from read_entry. - """ - global_magic = b"!\n" - file_magic = b"`\n" - - def __init__(self, fileobj): - """ - @param fileobj: a file-like object providing nothing but read(length) - """ - self.fileobj = fileobj - self.remaining = None - self.padding = 0 - - def read_magic(self): - """Consume the AR magic marker at the beginning of an AR file. You - must not call any other method before calling this method. - @raises ValueError: if the magic is not found - """ - data = self.fileobj.read(len(self.global_magic)) - if data != self.global_magic: - raise ValueError("ar global header not found") - self.remaining = 0 - - def read_entry(self): - """Read the next file header, return the filename and record the - length of the next file, so that the read method can be used to - exhaustively read the current file. - @rtype: bytes - @returns: the name of the next file - @raises ValueError: if the data format is wrong - @raises EOFError: when the end f the stream is reached - """ - self.skip_current_entry() - if self.padding: - if self.fileobj.read(1) != b'\n': - raise ValueError("missing ar padding") - self.padding = 0 - file_header = self.fileobj.read(60) - if not file_header: - raise EOFError("end of archive found") - parts = struct.unpack("16s 12s 6s 6s 8s 10s 2s", file_header) - parts = [p.rstrip(b"/ ") for p in parts] - if parts.pop() != self.file_magic: - raise ValueError("ar file header not found") - self.remaining = int(parts[5]) - self.padding = self.remaining % 2 - return parts[0] # name - - def skip_current_entry(self): - """Skip the remainder of the current file. This method must not be - called before calling read_entry. - @raises ValueError: if the archive appears truncated - """ - while self.remaining: - data = self.fileobj.read(min(4096, self.remaining)) - if not data: - raise ValueError("archive truncated") - self.remaining -= len(data) - - def read(self, length=None): - """ - @type length: int or None - @param length: number of bytes to read from the current file - @rtype: bytes - @returns: length or fewer bytes from the current file - """ - if length is None: - length = self.remaining - else: - length = min(self.remaining, length) - data = self.fileobj.read(length) - self.remaining -= len(data) - return data diff --git a/dedup/debpkg.py b/dedup/debpkg.py index 38086ec..0ecb123 100644 --- a/dedup/debpkg.py +++ b/dedup/debpkg.py @@ -1,8 +1,8 @@ import tarfile +import arpy from debian import deb822 -from dedup.arreader import ArReader from dedup.compression import decompress from dedup.hashing import hash_file @@ -52,45 +52,36 @@ class DebExtractor: @param filelike: is a file-like object containing the contents of the Debian packge and can be read once without seeks. 
""" - af = ArReader(filelike) - af.read_magic() - while True: - try: - name = af.read_entry() - except EOFError: - break - else: - self.handle_ar_member(name, af) + af = arpy.Archive(fileobj=filelike) + for member in af: + self.handle_ar_member(member) self.handle_ar_end() - def handle_ar_member(self, name, filelike): + def handle_ar_member(self, arfiledata: arpy.ArchiveFileData) -> None: """Handle an ar archive member of the Debian package. If you replace this method, you must also replace handle_ar_end and none of the methods handle_debversion, handle_control_tar or handle_data_tar are called. - @type name: bytes - @param name: is the name of the member - @param filelike: is a file-like object containing the contents of the - member and can be read once without seeks. """ + name = arfiledata.header.name if self.arstate == "start": if name != b"debian-binary": raise ValueError("debian-binary not found") - version = filelike.read() + version = arfiledata.read() self.handle_debversion(version) if not version.startswith(b"2."): raise ValueError("debian version not recognized") self.arstate = "version" elif self.arstate == "version": if name.startswith(b"control.tar"): - filelike = decompress(filelike, name[11:].decode("ascii")) + filelike = decompress(arfiledata, name[11:].decode("ascii")) self.handle_control_tar(opentar(filelike)) self.arstate = "control" elif not name.startswith(b"_"): raise ValueError("unexpected ar member %r" % name) elif self.arstate == "control": if name.startswith(b"data.tar"): - filelike = decompress(filelike, name[8:].decode("ascii")) + filelike = decompress(arfiledata, name[8:].decode("ascii")) self.handle_data_tar(opentar(filelike)) self.arstate = "data" elif not name.startswith(b"_"): -- cgit v1.2.3 From 88e66d81f9f04b531e8619efdca741244020b460 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Sun, 25 Oct 2020 16:50:56 +0100 Subject: drop obsolete python modules Both lzma and concurrent.futures are now part of the standard library and solely exist as virtual packages. 
--- README | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README b/README index ed4e8cb..db7fde4 100644 --- a/README +++ b/README @@ -1,7 +1,7 @@ Required packages ----------------- - aptitude install python3 python3-arpy python3-debian python3-lzma python3-jinja2 python3-werkzeug sqlite3 python3-pil python3-yaml python3-concurrent.futures python3-pkg-resources + aptitude install python3 python3-arpy python3-debian python3-jinja2 python3-werkzeug sqlite3 python3-pil python3-yaml python3-pkg-resources Create a database ----------------- -- cgit v1.2.3 From be595dd6480d00db719d120b83e3a303392f648b Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Wed, 29 Dec 2021 12:00:26 +0100 Subject: DecompressedStream: eliminate redundant closed field --- dedup/compression.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/dedup/compression.py b/dedup/compression.py index 161eda2..ea921c4 100644 --- a/dedup/compression.py +++ b/dedup/compression.py @@ -103,10 +103,9 @@ class DecompressedStream: self.decompressor = decompressor self.buff = b"" self.pos = 0 - self.closed = False def _fill_buff_until(self, predicate): - assert not self.closed + assert self.fileobj is not None data = True while True: if predicate(self.buff) or not data: @@ -143,12 +142,12 @@ class DecompressedStream: return iter(self.readline, b'') def tell(self): - assert not self.closed + assert self.fileobj is not None return self.pos def seek(self, pos): """Forward seeks by absolute position only.""" - assert not self.closed + assert self.fileobj is not None if pos < self.pos: raise ValueError("negative seek not allowed on decompressed stream") while True: @@ -162,12 +161,11 @@ class DecompressedStream: return def close(self): - if not self.closed: + if self.fileobj is not None: self.fileobj.close() self.fileobj = None self.decompressor = None self.buff = b"" - self.closed = True decompressors = { '.gz': GzipDecompressor, -- cgit v1.2.3 From 775bdde52ad5375773c0635e4ce52f74cb820525 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Wed, 29 Dec 2021 13:43:48 +0100 Subject: DecompressedStream: avoid mixing types for variable data The local variable data can be bool or bytes. That's inconvenient for static type checkers. Avoid doing so. 
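
The problem in miniature, assuming a checker with mypy-style inference;
seeding the loop variable with True widens its inferred type to a union of
bool and bytes, which the rewritten loop avoids:

    def drain_old(read) -> bytes:
        buff = b""
        data = True              # data starts as bool ...
        while data:
            data = read(4096)    # ... and becomes bytes: inferred bool | bytes
            buff += data         # a checker flags this concatenation
        return buff

    def drain_new(read) -> bytes:
        buff = b""
        while True:
            data = read(4096)    # data is always bytes here
            if not data:
                break
            buff += data
        return buff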
--- dedup/compression.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/dedup/compression.py b/dedup/compression.py index ea921c4..9cd63e5 100644 --- a/dedup/compression.py +++ b/dedup/compression.py @@ -106,15 +106,13 @@ class DecompressedStream: def _fill_buff_until(self, predicate): assert self.fileobj is not None - data = True - while True: - if predicate(self.buff) or not data: - return + while not predicate(self.buff): data = self.fileobj.read(self.blocksize) if data: self.buff += self.decompressor.decompress(data) elif hasattr(self.decompressor, "flush"): self.buff += self.decompressor.flush() + break def _read_from_buff(self, length): ret = self.buff[:length] -- cgit v1.2.3 From 529f985adaabfe0c63a1e7ad8d97ec36bb881e52 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Wed, 29 Dec 2021 14:55:43 +0100 Subject: drop unused function sql_add_version_compare --- dedup/utils.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/dedup/utils.py b/dedup/utils.py index 46f8e64..d3a27a0 100644 --- a/dedup/utils.py +++ b/dedup/utils.py @@ -2,8 +2,6 @@ import errno import urllib.error import urllib.request -from debian.debian_support import version_compare - from dedup.compression import decompress def fetchiter(cursor): @@ -12,10 +10,6 @@ def fetchiter(cursor): yield from rows rows = cursor.fetchmany() -def sql_add_version_compare(db): - db.create_collation("debian_version", version_compare) - db.create_function("debian_version_compare", 2, version_compare) - def open_compressed_mirror_url(url, extensions=(".xz", ".gz", "")): """Fetch the given url. Try appending each of the given compression schemes and move on in case it doesn't exist. Decompress the resulting -- cgit v1.2.3 From 2cb95eb8c68a692b0abb535925e8b55175285ea4 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Wed, 29 Dec 2021 15:04:35 +0100 Subject: don't return the first parameter from hash_file Returning the object gets us into trouble as to what precisely the return type is at no benefit. --- dedup/debpkg.py | 2 +- dedup/hashing.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/dedup/debpkg.py b/dedup/debpkg.py index 0ecb123..de00e60 100644 --- a/dedup/debpkg.py +++ b/dedup/debpkg.py @@ -29,7 +29,7 @@ def get_tar_hashes(tar, hash_functions): if not elem.isreg(): # excludes hard links as well continue hasher = MultiHash(*[func() for func in hash_functions]) - hasher = hash_file(hasher, tar.extractfile(elem)) + hash_file(hasher, tar.extractfile(elem)) hashes = {} for hashobj in hasher.hashes: hashvalue = hashobj.hexdigest() diff --git a/dedup/hashing.py b/dedup/hashing.py index 21f14ea..27f303c 100644 --- a/dedup/hashing.py +++ b/dedup/hashing.py @@ -161,7 +161,6 @@ def hash_file(hashobj, filelike, blocksize=65536): while data: hashobj.update(data) data = filelike.read(blocksize) - return hashobj class HashedStream: """A file-like object, that supports sequential reading and hashes the -- cgit v1.2.3 From c2b5909eff090ebb3f19ab88308f0cc7b913157e Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Wed, 29 Dec 2021 15:24:34 +0100 Subject: ImageHash: gain a name property Instead of retroactively attaching a name to an ImageHash, autogenerate it via a property. Doing so also simplifies static type checking. 
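
The pattern reduced to a sketch (the class names here are made up): the
subclass only declares a prefix and the property composes the final name, so
nothing has to be patched onto instances after construction.

    import hashlib

    class NamedHash:
        name_prefix = ""

        def __init__(self, hashobj):
            self.hashobj = hashobj

        @property
        def name(self):
            # Derived on access instead of assigned retroactively.
            return self.name_prefix + self.hashobj.name

    class PngLikeHash(NamedHash):
        name_prefix = "png_"

    assert PngLikeHash(hashlib.sha512()).name == "png_sha512"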
--- dedup/image.py | 6 ++++++ importpkg.py | 10 ++-------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/dedup/image.py b/dedup/image.py index 2e64e6b..91321f4 100644 --- a/dedup/image.py +++ b/dedup/image.py @@ -69,9 +69,14 @@ class ImageHash: self.content.seek(pos) return "%s%8.8x%8.8x" % (hashobj.hexdigest(), width, height) + @property + def name(self): + return self.name_prefix + self.hashobj.name + class PNGHash(ImageHash): """A hash on the contents of a PNG image.""" + name_prefix = "png_" def detect(self): if self.content.tell() < 33: # header + IHDR @@ -86,6 +91,7 @@ class PNGHash(ImageHash): class GIFHash(ImageHash): """A hash on the contents of the first frame of a GIF image.""" + name_prefix = "gif_" def detect(self): if self.content.tell() < 10: # magic + logical dimension diff --git a/importpkg.py b/importpkg.py index 4693401..6988c1d 100755 --- a/importpkg.py +++ b/importpkg.py @@ -31,16 +31,10 @@ def gziphash(): return HashBlacklistContent(hashobj, boring_content) def pnghash(): - hashobj = PNGHash(hashlib.sha512()) - hashobj = SuppressingHash(hashobj, (ValueError,)) - hashobj.name = "png_sha512" - return hashobj + return SuppressingHash(PNGHash(hashlib.sha512()), (ValueError,)) def gifhash(): - hashobj = GIFHash(hashlib.sha512()) - hashobj = SuppressingHash(hashobj, (ValueError,)) - hashobj.name = "gif_sha512" - return hashobj + return SuppressingHash(GIFHash(hashlib.sha512()), (ValueError,)) class ProcessingFinished(Exception): pass -- cgit v1.2.3 From 6b87bc371b91917980884d6dd20e39d3cda47fc7 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Wed, 29 Dec 2021 15:36:12 +0100 Subject: DecompressedHash: also gain a name property for consistency --- dedup/hashing.py | 7 +++++-- importpkg.py | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/dedup/hashing.py b/dedup/hashing.py index 27f303c..9cebcbb 100644 --- a/dedup/hashing.py +++ b/dedup/hashing.py @@ -84,7 +84,7 @@ class HashBlacklistContent: class DecompressedHash: """Apply a decompression function before the hash. This class provides the hashlib interface (update, hexdigest, copy) excluding digest and name.""" - def __init__(self, decompressor, hashobj): + def __init__(self, decompressor, hashobj, name="unnamed"): """ @param decompressor: a decompression object like bz2.BZ2Decompressor or lzma.LZMADecompressor. It has to provide methods decompress and @@ -92,9 +92,11 @@ class DecompressedHash: method. 
        @param hashobj: a hashlib-like obj providing methods update, hexdigest
             and copy
+        @param name: initialized the name property
         """
         self.decompressor = decompressor
         self.hashobj = hashobj
+        self.name = name
 
     def update(self, data):
         self.hashobj.update(self.decompressor.decompress(data))
@@ -113,7 +115,8 @@
         return tmphash.hexdigest()
 
     def copy(self):
-        return DecompressedHash(self.decompressor.copy(), self.hashobj.copy())
+        return DecompressedHash(self.decompressor.copy(), self.hashobj.copy(),
+                                self.name)
 
 class SuppressingHash:
     """A hash that silences exceptions from the update and hexdigest methods of
diff --git a/importpkg.py b/importpkg.py
index 6988c1d..6772c4d 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -25,9 +25,9 @@ def sha512_nontrivial():
     return HashBlacklistContent(hashlib.sha512(), boring_content)
 
 def gziphash():
-    hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512())
+    hashobj = hashlib.sha512()
+    hashobj = DecompressedHash(GzipDecompressor(), hashobj, "gzip_sha512")
     hashobj = SuppressingHash(hashobj, (ValueError, zlib.error))
-    hashobj.name = "gzip_sha512"
     return HashBlacklistContent(hashobj, boring_content)
 
 def pnghash():
-- cgit v1.2.3

From e3fa967ef54a0a7b107eebc1ceb83f66e770dc34 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Wed, 29 Dec 2021 15:55:28 +0100
Subject: importpkg: fix suppression of boring content

The content must be bytes. Passing str silently skips the suppression.
---
 importpkg.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/importpkg.py b/importpkg.py
index 6772c4d..7bca70b 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -19,7 +19,7 @@ from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
     HashBlacklistContent
 from dedup.compression import GzipDecompressor
 from dedup.image import GIFHash, PNGHash
 
-boring_content = set(("", "\n"))
+boring_content = set((b"", b"\n"))
 
 def sha512_nontrivial():
     return HashBlacklistContent(hashlib.sha512(), boring_content)
-- cgit v1.2.3

From f02881a79e25f42cefbb897d7ace1373d220ee73 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Wed, 29 Dec 2021 18:05:36 +0100
Subject: autoimport.py: convert to use pathlib

---
 autoimport.py | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/autoimport.py b/autoimport.py
index ec47db0..d9ee0a3 100755
--- a/autoimport.py
+++ b/autoimport.py
@@ -7,7 +7,7 @@ import argparse
 import contextlib
 import errno
 import multiprocessing
-import os
+import pathlib
 import sqlite3
 import subprocess
 import sys
@@ -36,22 +36,21 @@ def process_http(pkgs, url, addhash=True):
         pkgs[name] = inst
 
 def process_file(pkgs, filename):
-    base = os.path.basename(filename)
-    if not base.endswith(".deb"):
+    if filename.suffix != ".deb":
         raise ValueError("filename does not end in .deb")
-    parts = base.split("_")
+    parts = filename.name.split("_")
     if len(parts) != 3:
         raise ValueError("filename not in form name_version_arch.deb")
     name, version, _ = parts
     version = urllib.parse.unquote(version)
     if name in pkgs and version_compare(pkgs[name]["version"], version) > 0:
         return
-    pkgs[name] = dict(version=version, filename=filename)
+    pkgs[name] = dict(version=version, filename=str(filename))
 
 def process_dir(pkgs, d):
-    for entry in os.listdir(d):
+    for entry in d.iterdir():
         try:
-            process_file(pkgs, os.path.join(d, entry))
+            process_file(pkgs, entry)
         except ValueError:
             pass
 
@@ -63,11 +62,11 @@ def process_pkg(name, pkgdict, outpath):
         importcmd.extend(["-H", pkgdict["sha256hash"]])
     if filename.startswith(("http://",
"https://", "ftp://", "file://")): importcmd.append(filename) - with open(outpath, "w") as outp: + with outpath.open("w") as outp: subprocess.check_call(importcmd, stdout=outp, close_fds=True) else: with open(filename) as inp: - with open(outpath, "w") as outp: + with outpath.open("w") as outp: subprocess.check_call(importcmd, stdin=inp, stdout=outp, close_fds=True) print("preprocessed %s" % name) @@ -86,7 +85,7 @@ def main(): parser.add_argument("files", nargs='+', help="files or directories or repository urls") args = parser.parse_args() - tmpdir = tempfile.mkdtemp(prefix="debian-dedup") + tmpdir = pathlib.Path(tempfile.mkdtemp(prefix="debian-dedup")) db = sqlite3.connect(args.database) cur = db.cursor() cur.execute("PRAGMA foreign_keys = ON;") @@ -96,10 +95,12 @@ def main(): print("processing %s" % d) if d.startswith(("http://", "https://", "ftp://", "file://")): process_http(pkgs, d, not args.noverify) - elif os.path.isdir(d): - process_dir(pkgs, d) else: - process_file(pkgs, d) + dp = pathlib.Path(d) + if dp.is_dir(): + process_dir(pkgs, dp) + else: + process_file(pkgs, dp) print("reading database") cur.execute("SELECT name, version FROM package;") @@ -115,23 +116,22 @@ def main(): with e: fs = {} for name, pkg in pkgs.items(): - outpath = os.path.join(tmpdir, name) - fs[e.submit(process_pkg, name, pkg, outpath)] = name + fs[e.submit(process_pkg, name, pkg, tmpdir / name)] = name for f in concurrent.futures.as_completed(fs.keys()): name = fs[f] if f.exception(): print("%s failed to import: %r" % (name, f.exception())) continue - inf = os.path.join(tmpdir, name) + inf = tmpdir / name print("sqlimporting %s" % name) - with open(inf) as inp: + with inf.open() as inp: try: readyaml(db, inp) except Exception as exc: print("%s failed sql with exception %r" % (name, exc)) else: - os.unlink(inf) + inf.unlink() if args.prune: delpkgs = knownpkgs - distpkgs @@ -142,12 +142,12 @@ def main(): # due to ON DELETE CASCADE clauses. db.commit() try: - os.rmdir(tmpdir) + tmpdir.rmdir() except OSError as err: if err.errno != errno.ENOTEMPTY: raise print("keeping temporary directory %s due to failed packages %s" % - (tmpdir, " ".join(os.listdir(tmpdir)))) + (tmpdir, " ".join(map(str, tmpdir.iterdir())))) if __name__ == "__main__": main() -- cgit v1.2.3 From c7615fcb537f547da3068d3e489437e70db58447 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Wed, 29 Dec 2021 20:34:51 +0100 Subject: webapp: forward compatibility with newer werkzeug --- webapp.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/webapp.py b/webapp.py index 69e9df8..8f5d342 100755 --- a/webapp.py +++ b/webapp.py @@ -11,7 +11,10 @@ from werkzeug.exceptions import HTTPException, NotFound from werkzeug.routing import Map, Rule from werkzeug.utils import redirect from werkzeug.wrappers import Request, Response -from werkzeug.wsgi import SharedDataMiddleware +try: + from werkzeug.middleware.shared_data import SharedDataMiddleware +except ImportError: + from werkzeug.wsgi import SharedDataMiddleware from dedup.utils import fetchiter -- cgit v1.2.3 From e118de84d60e6f0d7662dcbb6aa362f452dda6ba Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Wed, 29 Dec 2021 20:56:03 +0100 Subject: webapp: improve performance html_response expects a str-generator, but when we call the render method, we receive a plain str. It can be iterated - one character at a time. That's what encode_and_buffer will do in this case. So better stream all the time. 
--- webapp.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/webapp.py b/webapp.py index 8f5d342..9993cb0 100755 --- a/webapp.py +++ b/webapp.py @@ -100,7 +100,7 @@ class Application: elif endpoint == "index": if not request.environ["PATH_INFO"]: raise InternalRedirect("/") - return html_response(index_template.render(dict(urlroot=""))) + return html_response(index_template.stream(dict(urlroot=""))) elif endpoint == "source": return self.show_source(args["package"]) raise NotFound() @@ -159,7 +159,7 @@ class Application: (params["pid"],)) params["issues"] = dict(cur.fetchall()) cur.close() - return html_response(package_template.render(params)) + return html_response(package_template.stream(params)) def compute_comparison(self, pid1, pid2): """Compute a sequence of comparison objects ordered by the size of the @@ -237,7 +237,7 @@ class Application: raise NotFound() params = dict(function=function, hashvalue=hashvalue, entries=entries, urlroot="../..") - return html_response(hash_template.render(params)) + return html_response(hash_template.stream(params)) def show_source(self, package): with contextlib.closing(self.db.cursor()) as cur: @@ -256,7 +256,7 @@ class Application: if not (oldentry and oldentry["savable"] >= size): binpkgs[binary] = entry params = dict(source=package, packages=binpkgs, urlroot="..") - return html_response(source_template.render(params)) + return html_response(source_template.stream(params)) def main(): parser = argparse.ArgumentParser() -- cgit v1.2.3 From 69a8861b704c969260ecb55110d8e41cd9aaf0a7 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Wed, 29 Dec 2021 21:00:04 +0100 Subject: webapp: speed up encode_and_buffer We now know that our parameter is a jinja2.environment.TemplateStream. Enable buffering and accumulate via an io.BytesIO to avoid O(n^2) append. --- webapp.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/webapp.py b/webapp.py index 9993cb0..d91d724 100755 --- a/webapp.py +++ b/webapp.py @@ -3,6 +3,7 @@ import argparse import contextlib import datetime +import io import sqlite3 from wsgiref.simple_server import make_server @@ -49,15 +50,16 @@ hash_template = jinjaenv.get_template("hash.html") index_template = jinjaenv.get_template("index.html") source_template = jinjaenv.get_template("source.html") -def encode_and_buffer(iterator): - buff = b"" - for elem in iterator: - buff += elem.encode("utf8") - if len(buff) >= 2048: - yield buff - buff = b"" - if buff: - yield buff +def encode_and_buffer(stream): + stream.enable_buffering(16) + buff = io.BytesIO() + for elem in stream: + buff.write(elem.encode("utf8")) + if buff.tell() >= 2048: + yield buff.getvalue() + buff = io.BytesIO() + if buff.tell() > 0: + yield buff.getvalue() def html_response(unicode_iterator, max_age=24 * 60 * 60): resp = Response(encode_and_buffer(unicode_iterator), mimetype="text/html") -- cgit v1.2.3 From d9d757792682e98e006d93f6bcbb94688d3a0f3f Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Wed, 29 Dec 2021 21:05:58 +0100 Subject: autoimport: avoid changing variable type knownpkgvers is a dict while knownpkgs is a set. Separating them helps static type checkers. 
---
 autoimport.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/autoimport.py b/autoimport.py
index d9ee0a3..eb610b4 100755
--- a/autoimport.py
+++ b/autoimport.py
@@ -104,14 +104,15 @@
     print("reading database")
     cur.execute("SELECT name, version FROM package;")
-    knownpkgs = dict((row[0], row[1]) for row in cur.fetchall())
+    knownpkgvers = dict((row[0], row[1]) for row in cur.fetchall())
     distpkgs = set(pkgs.keys())
     if args.new:
         for name in distpkgs:
-            if name in knownpkgs and version_compare(pkgs[name]["version"],
-                                                     knownpkgs[name]) <= 0:
+            if name in knownpkgvers and \
+                    version_compare(pkgs[name]["version"], knownpkgvers[name]) <= 0:
                 del pkgs[name]
-    knownpkgs = set(knownpkgs)
+    knownpkgs = set(knownpkgvers)
+    del knownpkgvers
 
     with e:
         fs = {}
-- cgit v1.2.3

From 9b2cd74186f74a3c3e7c10b0ce39ebd992b11d36 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Wed, 29 Dec 2021 21:14:38 +0100
Subject: webapp: avoid changing variable type

Again static type checking is the driver for the change here.
---
 webapp.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/webapp.py b/webapp.py
index d91d724..0d9e3f9 100755
--- a/webapp.py
+++ b/webapp.py
@@ -22,18 +22,18 @@ from dedup.utils import fetchiter
 
 jinjaenv = jinja2.Environment(loader=jinja2.PackageLoader("dedup", "templates"))
 
 def format_size(size):
-    size = float(size)
+    sizef = float(size)
     fmt = "%d B"
-    if size >= 1024:
-        size /= 1024
+    if sizef >= 1024:
+        sizef /= 1024
         fmt = "%.1f KB"
-    if size >= 1024:
-        size /= 1024
+    if sizef >= 1024:
+        sizef /= 1024
         fmt = "%.1f MB"
-    if size >= 1024:
-        size /= 1024
+    if sizef >= 1024:
+        sizef /= 1024
         fmt = "%.1f GB"
-    return fmt % size
+    return fmt % sizef
 
 def function_combination(function1, function2):
     if function1 == function2:
-- cgit v1.2.3

From 35c22db308a91e82ed4a5f6a9c937c186d81d810 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Wed, 29 Dec 2021 22:14:50 +0100
Subject: DecompressedStream: fix endless loop

Fixes: 775bdde52ad5 ("DecompressedStream: avoid mixing types for variable data")
---
 dedup/compression.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/dedup/compression.py b/dedup/compression.py
index 9cd63e5..6d361ac 100644
--- a/dedup/compression.py
+++ b/dedup/compression.py
@@ -110,8 +110,9 @@
             data = self.fileobj.read(self.blocksize)
             if data:
                 self.buff += self.decompressor.decompress(data)
-            elif hasattr(self.decompressor, "flush"):
-                self.buff += self.decompressor.flush()
+            else:
+                if hasattr(self.decompressor, "flush"):
+                    self.buff += self.decompressor.flush()
                 break
-- cgit v1.2.3

From 0b4882ecf657d70dd3236dcf176e083bf08dccdd Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Thu, 30 Dec 2021 17:52:38 +0100
Subject: DecompressedStream: improve performance

When the decompression ratio is huge, we may be faced with a large
(multiple megabytes) bytes object. Slicing that object incurs a copy and
becomes O(n^2), while appending to and trimming a bytearray is much
faster.
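
A rough illustration of the difference, not taken from the commit: slicing a
bytes object copies the remainder on every step, while CPython can trim a
bytearray prefix in place.

    import timeit

    def drain_bytes(n):
        buff = b"x" * n
        while buff:
            buff = buff[4096:]     # copies the whole remainder: O(n^2) total

    def drain_bytearray(n):
        buff = bytearray(b"x" * n)
        while buff:
            del buff[:4096]        # trims in place: roughly O(n) total

    for fn in (drain_bytes, drain_bytearray):
        print(fn.__name__, timeit.timeit(lambda: fn(1 << 22), number=1))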
---
 dedup/compression.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/dedup/compression.py b/dedup/compression.py
index 6d361ac..da6e9a0 100644
--- a/dedup/compression.py
+++ b/dedup/compression.py
@@ -101,7 +101,7 @@
         """
         self.fileobj = fileobj
         self.decompressor = decompressor
-        self.buff = b""
+        self.buff = bytearray()
         self.pos = 0
 
     def _fill_buff_until(self, predicate):
@@ -116,8 +116,8 @@
                 break
 
     def _read_from_buff(self, length):
-        ret = self.buff[:length]
-        self.buff = self.buff[length:]
+        ret = bytes(self.buff[:length])
+        self.buff[:length] = b""
         self.pos += length
         return ret
 
@@ -164,7 +164,7 @@
             self.fileobj.close()
             self.fileobj = None
             self.decompressor = None
-            self.buff = b""
+            self.buff = bytearray()
 
 decompressors = {
     '.gz': GzipDecompressor,
-- cgit v1.2.3

From 1631e91b116ebf04ba9bd332e12c2f165263088b Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Fri, 31 Dec 2021 13:00:29 +0100
Subject: webapp.py: consistently close cursors using context managers

---
 webapp.py | 72 +++++++++++++++++++++++++++++++--------------------------------
 1 file changed, 36 insertions(+), 36 deletions(-)

diff --git a/webapp.py b/webapp.py
index 0d9e3f9..162a5a4 100755
--- a/webapp.py
+++ b/webapp.py
@@ -84,6 +84,9 @@
             Rule("/source/<package>/", methods=("GET",), endpoint="source"),
         ])
 
+    def cursor(self):
+        return contextlib.closing(self.db.cursor())
+
     @Request.application
     def __call__(self, request):
         mapadapter = self.routingmap.bind_to_environ(request.environ)
@@ -112,7 +115,7 @@
             return e
 
     def get_details(self, package):
-        with contextlib.closing(self.db.cursor()) as cur:
+        with self.cursor() as cur:
             cur.execute("SELECT id, version, architecture FROM package WHERE name = ?;",
                         (package,))
             row = cur.fetchone()
@@ -132,14 +135,14 @@
         return details
 
     def get_dependencies(self, pid):
-        with contextlib.closing(self.db.cursor()) as cur:
+        with self.cursor() as cur:
             cur.execute("SELECT required FROM dependency WHERE pid = ?;",
                         (pid,))
             return set(row[0] for row in fetchiter(cur))
 
     def cached_sharedstats(self, pid):
         sharedstats = {}
-        with contextlib.closing(self.db.cursor()) as cur:
+        with self.cursor() as cur:
             cur.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;",
                         (pid,))
             for pid2, package2, func1, func2, files, size in fetchiter(cur):
@@ -156,11 +159,10 @@
         params["dependencies"] = self.get_dependencies(params["pid"])
         params["shared"] = self.cached_sharedstats(params["pid"])
         params["urlroot"] = ".."
-        cur = self.db.cursor()
-        cur.execute("SELECT content.filename, issue.issue FROM content JOIN issue ON content.id = issue.cid WHERE content.pid = ?;",
-                    (params["pid"],))
-        params["issues"] = dict(cur.fetchall())
-        cur.close()
+        with self.cursor() as cur:
+            cur.execute("SELECT content.filename, issue.issue FROM content JOIN issue ON content.id = issue.cid WHERE content.pid = ?;",
+                        (params["pid"],))
+            params["issues"] = dict(cur.fetchall())
         return html_response(package_template.stream(params))
 
     def compute_comparison(self, pid1, pid2):
@@ -173,35 +175,33 @@
         * matches: A mapping from filenames in package 2 (pid2) to a mapping
           from hash function pairs to hash values.
""" - cur = self.db.cursor() - cur.execute("SELECT content.id, content.filename, content.size, hash.hash FROM content JOIN hash ON content.id = hash.cid JOIN duplicate ON content.id = duplicate.cid JOIN function ON hash.fid = function.id WHERE pid = ? AND function.name = 'sha512' ORDER BY size DESC;", - (pid1,)) - cursize = -1 - files = dict() - minmatch = 2 if pid1 == pid2 else 1 - cur2 = self.db.cursor() - for cid, filename, size, hashvalue in fetchiter(cur): - if cursize != size: - for entry in files.values(): - if len(entry["matches"]) >= minmatch: - yield entry - files.clear() - cursize = size + with self.cursor() as cur, self.cursor() as cur2: + cur.execute("SELECT content.id, content.filename, content.size, hash.hash FROM content JOIN hash ON content.id = hash.cid JOIN duplicate ON content.id = duplicate.cid JOIN function ON hash.fid = function.id WHERE pid = ? AND function.name = 'sha512' ORDER BY size DESC;", + (pid1,)) + cursize = -1 + files = dict() + minmatch = 2 if pid1 == pid2 else 1 + cur2 = self.db.cursor() + for cid, filename, size, hashvalue in fetchiter(cur): + if cursize != size: + for entry in files.values(): + if len(entry["matches"]) >= minmatch: + yield entry + files.clear() + cursize = size - if hashvalue in files: - files[hashvalue]["filenames"].add(filename) - continue + if hashvalue in files: + files[hashvalue]["filenames"].add(filename) + continue - entry = dict(filenames=set((filename,)), size=size, matches={}) - files[hashvalue] = entry + entry = dict(filenames=set((filename,)), size=size, matches={}) + files[hashvalue] = entry - cur2.execute("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = ? AND pid = ? AND fa.eqclass = fb.eqclass;", - (cid, pid2)) - for func1, hashvalue, func2, filename in fetchiter(cur2): - entry["matches"].setdefault(filename, {})[func1, func2] = \ - hashvalue - cur2.close() - cur.close() + cur2.execute("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = ? AND pid = ? AND fa.eqclass = fb.eqclass;", + (cid, pid2)) + for func1, hashvalue, func2, filename in fetchiter(cur2): + entry["matches"].setdefault(filename, {})[func1, func2] = \ + hashvalue for entry in files.values(): if len(entry["matches"]) >= minmatch: @@ -221,7 +221,7 @@ class Application: return html_response(detail_template.stream(params)) def show_hash(self, function, hashvalue): - with contextlib.closing(self.db.cursor()) as cur: + with self.cursor() as cur: cur.execute("SELECT package.name, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = ? 
AND hash = ?;", (function, hashvalue,)) entries = [dict(package=package, filename=filename, size=size, @@ -242,7 +242,7 @@ class Application: return html_response(hash_template.stream(params)) def show_source(self, package): - with contextlib.closing(self.db.cursor()) as cur: + with self.cursor() as cur: cur.execute("SELECT name FROM package WHERE source = ?;", (package,)) binpkgs = dict.fromkeys(pkg for pkg, in fetchiter(cur)) -- cgit v1.2.3 From 5b359b10053cbade539246eec26e86b44793ca40 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Fri, 31 Dec 2021 15:24:01 +0100 Subject: dedup.utils: uninline helper function iterate_packages --- autoimport.py | 26 +++++++++++--------------- dedup/utils.py | 11 +++++++++++ 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/autoimport.py b/autoimport.py index eb610b4..0f518c6 100755 --- a/autoimport.py +++ b/autoimport.py @@ -4,7 +4,6 @@ packages contained. It has rather strong assumptions on the working directory. """ import argparse -import contextlib import errno import multiprocessing import pathlib @@ -14,26 +13,23 @@ import sys import tempfile import urllib.parse import concurrent.futures -from debian import deb822 from debian.debian_support import version_compare -from dedup.utils import open_compressed_mirror_url +from dedup.utils import iterate_packages from readyaml import readyaml def process_http(pkgs, url, addhash=True): - listurl = url + "/dists/sid/main/binary-amd64/Packages" - with contextlib.closing(open_compressed_mirror_url(listurl)) as pkglist: - for pkg in deb822.Packages.iter_paragraphs(pkglist): - name = pkg["Package"] - if name in pkgs and \ - version_compare(pkgs[name]["version"], pkg["Version"]) > 0: - continue - inst = dict(version=pkg["Version"], - filename="%s/%s" % (url, pkg["Filename"])) - if addhash: - inst["sha256hash"] = pkg["SHA256"] - pkgs[name] = inst + for pkg in iterate_packages(url, "amd64"): + name = pkg["Package"] + if name in pkgs and \ + version_compare(pkgs[name]["version"], pkg["Version"]) > 0: + continue + inst = dict(version=pkg["Version"], + filename="%s/%s" % (url, pkg["Filename"])) + if addhash: + inst["sha256hash"] = pkg["SHA256"] + pkgs[name] = inst def process_file(pkgs, filename): if filename.suffix != ".deb": diff --git a/dedup/utils.py b/dedup/utils.py index d3a27a0..55cdef0 100644 --- a/dedup/utils.py +++ b/dedup/utils.py @@ -1,7 +1,10 @@ +import contextlib import errno import urllib.error import urllib.request +import debian.deb822 + from dedup.compression import decompress def fetchiter(cursor): @@ -30,3 +33,11 @@ def open_compressed_mirror_url(url, extensions=(".xz", ".gz", "")): else: return decompress(handle, ext) raise OSError(errno.ENOENT, "No such file or directory") + +def iterate_packages(mirror, architecture, distribution="sid", section="main"): + """Download the relevant binary package list and generate + debian.deb822.Packages objects per listed package.""" + url = "%s/dists/%s/%s/binary-%s/Packages" % \ + (mirror, distribution, section, architecture) + with contextlib.closing(open_compressed_mirror_url(url)) as pkglist: + yield from debian.deb822.Packages.iter_paragraphs(pkglist) -- cgit v1.2.3
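
For context, a sketch of how the uninlined helper is meant to be consumed;
the mirror URL below is only a placeholder:

    from dedup.utils import iterate_packages

    # distribution and section default to "sid" and "main".
    for pkg in iterate_packages("http://deb.debian.org/debian", "amd64"):
        print(pkg["Package"], pkg["Version"], pkg["Filename"])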