summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--dedup/debpkg.py55
-rwxr-xr-ximportpkg.py55
2 files changed, 64 insertions, 46 deletions
diff --git a/dedup/debpkg.py b/dedup/debpkg.py
new file mode 100644
index 0000000..d8cc22f
--- /dev/null
+++ b/dedup/debpkg.py
@@ -0,0 +1,55 @@
+from debian import deb822
+
+from dedup.hashing import hash_file
+
+def process_control(control_contents):
+ """Parses the contents of a control file from a control.tar.gz of a Debian
+ package and returns a dictionary containing the fields relevant to dedup.
+ @type control_contents: bytes
+ @rtype: {str: object}
+ """
+ control = deb822.Packages(control_contents)
+ package = control["package"].encode("ascii")
+ try:
+ source = control["source"].encode("ascii").split()[0]
+ except KeyError:
+ source = package
+ version = control["version"].encode("ascii")
+ architecture = control["architecture"].encode("ascii")
+
+ depends = set(dep[0]["name"].encode("ascii")
+ for dep in control.relations.get("depends", ())
+ if len(dep) == 1)
+ return dict(package=package, source=source, version=version,
+ architecture=architecture, depends=depends)
+
+class MultiHash(object):
+ def __init__(self, *hashes):
+ self.hashes = hashes
+
+ def update(self, data):
+ for hasher in self.hashes:
+ hasher.update(data)
+
+def get_tar_hashes(tar, hash_functions):
+    """Given a TarFile, read all regular files and compute all of the given hash
+ functions on each file.
+ @type tar: tarfile.TarFile
+ @param hash_functions: a sequence of parameter-less functions each creating a
+ new hashlib-like object
+    @rtype: gen((str, int, {str: str}))
+ @returns: an iterable of (filename, filesize, hashes) tuples where
+ hashes is a dict mapping hash function names to hash values
+ """
+
+ for elem in tar:
+ if not elem.isreg(): # excludes hard links as well
+ continue
+ hasher = MultiHash(*[func() for func in hash_functions])
+ hasher = hash_file(hasher, tar.extractfile(elem))
+ hashes = {}
+ for hashobj in hasher.hashes:
+ hashvalue = hashobj.hexdigest()
+ if hashvalue:
+ hashes[hashobj.name] = hashvalue
+ yield (elem.name, elem.size, hashes)
diff --git a/importpkg.py b/importpkg.py
index 1334dd6..54f6181 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -11,24 +11,16 @@ import sys
import tarfile
import zlib
-from debian import deb822
import lzma
import yaml
from dedup.arreader import ArReader
+from dedup.debpkg import process_control, get_tar_hashes
from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \
- HashedStream, hash_file
+ HashedStream
from dedup.compression import GzipDecompressor, DecompressedStream
from dedup.image import GIFHash, PNGHash
-class MultiHash(object):
- def __init__(self, *hashes):
- self.hashes = hashes
-
- def update(self, data):
- for hasher in self.hashes:
- hasher.update(data)
-
boring_sha512_hashes = set((
# ""
"cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
@@ -57,37 +49,7 @@ def gifhash():
hashobj.name = "gif_sha512"
return hashobj
-def get_hashes(tar):
- for elem in tar:
- if not elem.isreg(): # excludes hard links as well
- continue
- hasher = MultiHash(sha512_nontrivial(), gziphash(), pnghash(),
- gifhash())
- hasher = hash_file(hasher, tar.extractfile(elem))
- hashes = {}
- for hashobj in hasher.hashes:
- hashvalue = hashobj.hexdigest()
- if hashvalue:
- hashes[hashobj.name] = hashvalue
- yield (elem.name, elem.size, hashes)
-
-def process_control(control_contents):
- control = deb822.Packages(control_contents)
- package = control["package"].encode("ascii")
- try:
- source = control["source"].encode("ascii").split()[0]
- except KeyError:
- source = package
- version = control["version"].encode("ascii")
- architecture = control["architecture"].encode("ascii")
-
- depends = set(dep[0]["name"].encode("ascii")
- for dep in control.relations.get("depends", ())
- if len(dep) == 1)
- return dict(package=package, source=source, version=version,
- architecture=architecture, depends=depends)
-
-def process_package(filelike):
+def process_package(filelike, hash_functions):
af = ArReader(filelike)
af.read_magic()
state = "start"
@@ -123,7 +85,7 @@ def process_package(filelike):
continue
if state != "control_file":
raise ValueError("missing control file")
- for name, size, hashes in get_hashes(tf):
+ for name, size, hashes in get_tar_hashes(tf, hash_functions):
try:
name = name.decode("utf8")
except UnicodeDecodeError:
@@ -133,9 +95,9 @@ def process_package(filelike):
yield "commit"
break
-def process_package_with_hash(filelike, sha256hash):
+def process_package_with_hash(filelike, hash_functions, sha256hash):
hstream = HashedStream(filelike, hashlib.sha256())
- for elem in process_package(hstream):
+ for elem in process_package(hstream, hash_functions):
if elem == "commit":
while hstream.read(4096):
pass
@@ -150,10 +112,11 @@ def main():
parser.add_option("-H", "--hash", action="store",
help="verify that stdin hash given sha256 hash")
options, args = parser.parse_args()
+ hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash]
if options.hash:
- gen = process_package_with_hash(sys.stdin, options.hash)
+ gen = process_package_with_hash(sys.stdin, hash_functions, options.hash)
else:
- gen = process_package(sys.stdin)
+ gen = process_package(sys.stdin, hash_functions)
yaml.safe_dump_all(gen, sys.stdout)
if __name__ == "__main__":