summary refs log tree commit diff
path: root/dedup/debpkg.py
diff options
context:
space:
mode:
Diffstat (limited to 'dedup/debpkg.py')
-rw-r--r-- dedup/debpkg.py 50
1 files changed, 26 insertions, 24 deletions
diff --git a/dedup/debpkg.py b/dedup/debpkg.py
index de00e60..0d1b7da 100644
--- a/dedup/debpkg.py
+++ b/dedup/debpkg.py
@@ -1,26 +1,29 @@
import tarfile
+import typing
import arpy
from debian import deb822
from dedup.compression import decompress
-from dedup.hashing import hash_file
+from dedup.hashing import HashlibLike, hash_file
class MultiHash:
- def __init__(self, *hashes):
+ def __init__(self, *hashes: HashlibLike):
self.hashes = hashes
- def update(self, data):
+ def update(self, data: bytes) -> None:
for hasher in self.hashes:
hasher.update(data)
-def get_tar_hashes(tar, hash_functions):
+
+def get_tar_hashes(
+ tar: tarfile.TarFile,
+ hash_functions: typing.Sequence[typing.Callable[[], HashlibLike]],
+) -> typing.Iterator[typing.Tuple[str, int, typing.Dict[str, str]]]:
"""Given a TarFile read all regular files and compute all of the given hash
functions on each file.
- @type tar: tarfile.TarFile
@param hash_functions: a sequence of parameter-less functions each creating a
new hashlib-like object
- @rtype: gen((str, int, {str: str}}
@returns: an iterable of (filename, filesize, hashes) tuples where
hashes is a dict mapping hash function names to hash values
"""
@@ -29,7 +32,9 @@ def get_tar_hashes(tar, hash_functions):
if not elem.isreg(): # excludes hard links as well
continue
hasher = MultiHash(*[func() for func in hash_functions])
- hash_file(hasher, tar.extractfile(elem))
+ extracted = tar.extractfile(elem)
+ assert extracted is not None
+ hash_file(hasher, extracted)
hashes = {}
for hashobj in hasher.hashes:
hashvalue = hashobj.hexdigest()
@@ -37,17 +42,18 @@ def get_tar_hashes(tar, hash_functions):
hashes[hashobj.name] = hashvalue
yield (elem.name, elem.size, hashes)
-def opentar(filelike):
+
+def opentar(filelike: typing.BinaryIO) -> tarfile.TarFile:
return tarfile.open(fileobj=filelike, mode="r|", encoding="utf8",
errors="surrogateescape")
class DebExtractor:
"Base class for extracting desired features from a Debian package."
- def __init__(self):
+ def __init__(self) -> None:
self.arstate = "start"
- def process(self, filelike):
+ def process(self, filelike: typing.BinaryIO) -> None:
"""Process a Debian package.
@param filelike: is a file-like object containing the contents of the
Debian package and can be read once without seeks.
@@ -89,22 +95,20 @@ class DebExtractor:
else:
assert self.arstate == "data"
- def handle_ar_end(self):
+ def handle_ar_end(self) -> None:
"Handle the end of the ar archive of the Debian package."
if self.arstate != "data":
raise ValueError("data.tar not found")
- def handle_debversion(self, version):
+ def handle_debversion(self, version: bytes) -> None:
"""Handle the debian-binary member of the Debian package.
- @type version: bytes
@param version: The full contents of the ar member.
"""
- def handle_control_tar(self, tarfileobj):
+ def handle_control_tar(self, tarfileobj: tarfile.TarFile) -> None:
"""Handle the control.tar member of the Debian package.
If you replace this method, none of handle_control_member,
handle_control_info or handle_control_end are called.
- @type tarfileobj: tarfile.TarFile
@param tarfileobj: is opened for streaming reads
"""
controlseen = False
@@ -113,7 +117,9 @@ class DebExtractor:
name = elem.name
if name.startswith("./"):
name = name[2:]
- content = tarfileobj.extractfile(elem).read()
+ extracted = tarfileobj.extractfile(elem)
+ assert extracted is not None
+ content = extracted.read()
self.handle_control_member(name, content)
if name == "control":
self.handle_control_info(deb822.Packages(content))
@@ -125,24 +131,20 @@ class DebExtractor:
raise ValueError("control missing from control.tar")
self.handle_control_end()
- def handle_control_member(self, name, content):
+ def handle_control_member(self, name: str, content: bytes) -> None:
"""Handle a file member of the control.tar member of the Debian package.
- @type name: str
@param name: is the plain member name
- @type content: bytes
"""
- def handle_control_info(self, info):
+ def handle_control_info(self, info: deb822.Packages) -> None:
"""Handle the control member of the control.tar member of the Debian
package.
- @type info: deb822.Packages
"""
- def handle_control_end(self):
+ def handle_control_end(self) -> None:
"Handle the end of the control.tar member of the Debian package."
- def handle_data_tar(self, tarfileobj):
+ def handle_data_tar(self, tarfileobj: tarfile.TarFile) -> None:
"""Handle the data.tar member of the Debian package.
- @type tarfileobj: tarfile.TarFile
@param tarfileobj: is opened for streaming reads
"""