From 27b95909f061ae3ecb3ba1b8d46adfef98ca5e6f Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Sun, 16 Feb 2020 08:21:20 +0100 Subject: drop support for Python 2.x --- dedup/debpkg.py | 32 +++++--------------------------- 1 file changed, 5 insertions(+), 27 deletions(-) (limited to 'dedup/debpkg.py') diff --git a/dedup/debpkg.py b/dedup/debpkg.py index 3a30b3e..38086ec 100644 --- a/dedup/debpkg.py +++ b/dedup/debpkg.py @@ -1,4 +1,3 @@ -import sys import tarfile from debian import deb822 @@ -7,7 +6,7 @@ from dedup.arreader import ArReader from dedup.compression import decompress from dedup.hashing import hash_file -class MultiHash(object): +class MultiHash: def __init__(self, *hashes): self.hashes = hashes @@ -38,32 +37,11 @@ def get_tar_hashes(tar, hash_functions): hashes[hashobj.name] = hashvalue yield (elem.name, elem.size, hashes) -if sys.version_info.major >= 3: - def opentar(filelike): - return tarfile.open(fileobj=filelike, mode="r|", encoding="utf8", - errors="surrogateescape") +def opentar(filelike): + return tarfile.open(fileobj=filelike, mode="r|", encoding="utf8", + errors="surrogateescape") - def decodetarname(name): - """Decoded name of a tarinfo. - @raises UnicodeDecodeError: - """ - try: - name.encode("utf8", "strict") - except UnicodeEncodeError as e: - if e.reason == "surrogates not allowed": - name.encode("utf8", "surrogateescape").decode("utf8", "strict") - return name -else: - def opentar(filelike): - return tarfile.open(fileobj=filelike, mode="r|") - - def decodetarname(name): - """Decoded name of a tarinfo. - @raises UnicodeDecodeError: - """ - return name.decode("utf8") - -class DebExtractor(object): +class DebExtractor: "Base class for extracting desired features from a Debian package." def __init__(self): -- cgit v1.2.3 From 4542d84439bbc6bd8f3151a9cb61d0ee85cd910e Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Sun, 25 Oct 2020 10:20:34 +0100 Subject: externalize ar parsing to arpy --- README | 2 +- dedup/arreader.py | 79 ------------------------------------------------------- dedup/debpkg.py | 27 +++++++------------ 3 files changed, 10 insertions(+), 98 deletions(-) delete mode 100644 dedup/arreader.py (limited to 'dedup/debpkg.py') diff --git a/README b/README index 4572c8a..ed4e8cb 100644 --- a/README +++ b/README @@ -1,7 +1,7 @@ Required packages ----------------- - aptitude install python3 python3-debian python3-lzma python3-jinja2 python3-werkzeug sqlite3 python3-pil python3-yaml python3-concurrent.futures python3-pkg-resources + aptitude install python3 python3-arpy python3-debian python3-lzma python3-jinja2 python3-werkzeug sqlite3 python3-pil python3-yaml python3-concurrent.futures python3-pkg-resources Create a database ----------------- diff --git a/dedup/arreader.py b/dedup/arreader.py deleted file mode 100644 index 8b14ff9..0000000 --- a/dedup/arreader.py +++ /dev/null @@ -1,79 +0,0 @@ -import struct - -class ArReader: - """Streaming AR file reader. After constructing an object, you usually - call read_magic once. Then you call read_entry in a loop and use the - ArReader object as file-like only providing read() to read the respective - file contents until you get EOFError from read_entry. - """ - global_magic = b"!\n" - file_magic = b"`\n" - - def __init__(self, fileobj): - """ - @param fileobj: a file-like object providing nothing but read(length) - """ - self.fileobj = fileobj - self.remaining = None - self.padding = 0 - - def read_magic(self): - """Consume the AR magic marker at the beginning of an AR file. You - must not call any other method before calling this method. - @raises ValueError: if the magic is not found - """ - data = self.fileobj.read(len(self.global_magic)) - if data != self.global_magic: - raise ValueError("ar global header not found") - self.remaining = 0 - - def read_entry(self): - """Read the next file header, return the filename and record the - length of the next file, so that the read method can be used to - exhaustively read the current file. - @rtype: bytes - @returns: the name of the next file - @raises ValueError: if the data format is wrong - @raises EOFError: when the end f the stream is reached - """ - self.skip_current_entry() - if self.padding: - if self.fileobj.read(1) != b'\n': - raise ValueError("missing ar padding") - self.padding = 0 - file_header = self.fileobj.read(60) - if not file_header: - raise EOFError("end of archive found") - parts = struct.unpack("16s 12s 6s 6s 8s 10s 2s", file_header) - parts = [p.rstrip(b"/ ") for p in parts] - if parts.pop() != self.file_magic: - raise ValueError("ar file header not found") - self.remaining = int(parts[5]) - self.padding = self.remaining % 2 - return parts[0] # name - - def skip_current_entry(self): - """Skip the remainder of the current file. This method must not be - called before calling read_entry. - @raises ValueError: if the archive appears truncated - """ - while self.remaining: - data = self.fileobj.read(min(4096, self.remaining)) - if not data: - raise ValueError("archive truncated") - self.remaining -= len(data) - - def read(self, length=None): - """ - @type length: int or None - @param length: number of bytes to read from the current file - @rtype: bytes - @returns: length or fewer bytes from the current file - """ - if length is None: - length = self.remaining - else: - length = min(self.remaining, length) - data = self.fileobj.read(length) - self.remaining -= len(data) - return data diff --git a/dedup/debpkg.py b/dedup/debpkg.py index 38086ec..0ecb123 100644 --- a/dedup/debpkg.py +++ b/dedup/debpkg.py @@ -1,8 +1,8 @@ import tarfile +import arpy from debian import deb822 -from dedup.arreader import ArReader from dedup.compression import decompress from dedup.hashing import hash_file @@ -52,45 +52,36 @@ class DebExtractor: @param filelike: is a file-like object containing the contents of the Debian packge and can be read once without seeks. """ - af = ArReader(filelike) - af.read_magic() - while True: - try: - name = af.read_entry() - except EOFError: - break - else: - self.handle_ar_member(name, af) + af = arpy.Archive(fileobj=filelike) + for member in af: + self.handle_ar_member(member) self.handle_ar_end() - def handle_ar_member(self, name, filelike): + def handle_ar_member(self, arfiledata: arpy.ArchiveFileData) -> None: """Handle an ar archive member of the Debian package. If you replace this method, you must also replace handle_ar_end and none of the methods handle_debversion, handle_control_tar or handle_data_tar are called. - @type name: bytes - @param name: is the name of the member - @param filelike: is a file-like object containing the contents of the - member and can be read once without seeks. """ + name = arfiledata.header.name if self.arstate == "start": if name != b"debian-binary": raise ValueError("debian-binary not found") - version = filelike.read() + version = arfiledata.read() self.handle_debversion(version) if not version.startswith(b"2."): raise ValueError("debian version not recognized") self.arstate = "version" elif self.arstate == "version": if name.startswith(b"control.tar"): - filelike = decompress(filelike, name[11:].decode("ascii")) + filelike = decompress(arfiledata, name[11:].decode("ascii")) self.handle_control_tar(opentar(filelike)) self.arstate = "control" elif not name.startswith(b"_"): raise ValueError("unexpected ar member %r" % name) elif self.arstate == "control": if name.startswith(b"data.tar"): - filelike = decompress(filelike, name[8:].decode("ascii")) + filelike = decompress(arfiledata, name[8:].decode("ascii")) self.handle_data_tar(opentar(filelike)) self.arstate = "data" elif not name.startswith(b"_"): -- cgit v1.2.3 From 2cb95eb8c68a692b0abb535925e8b55175285ea4 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Wed, 29 Dec 2021 15:04:35 +0100 Subject: don't return the first parameter from hash_file Returning the object gets us into trouble as to what precisely the return type is at no benefit. --- dedup/debpkg.py | 2 +- dedup/hashing.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'dedup/debpkg.py') diff --git a/dedup/debpkg.py b/dedup/debpkg.py index 0ecb123..de00e60 100644 --- a/dedup/debpkg.py +++ b/dedup/debpkg.py @@ -29,7 +29,7 @@ def get_tar_hashes(tar, hash_functions): if not elem.isreg(): # excludes hard links as well continue hasher = MultiHash(*[func() for func in hash_functions]) - hasher = hash_file(hasher, tar.extractfile(elem)) + hash_file(hasher, tar.extractfile(elem)) hashes = {} for hashobj in hasher.hashes: hashvalue = hashobj.hexdigest() diff --git a/dedup/hashing.py b/dedup/hashing.py index 21f14ea..27f303c 100644 --- a/dedup/hashing.py +++ b/dedup/hashing.py @@ -161,7 +161,6 @@ def hash_file(hashobj, filelike, blocksize=65536): while data: hashobj.update(data) data = filelike.read(blocksize) - return hashobj class HashedStream: """A file-like object, that supports sequential reading and hashes the -- cgit v1.2.3