diff options
-rw-r--r-- | dedup/debpkg.py | 55 | ||||
-rwxr-xr-x | importpkg.py | 55 |
2 files changed, 64 insertions, 46 deletions
diff --git a/dedup/debpkg.py b/dedup/debpkg.py new file mode 100644 index 0000000..d8cc22f --- /dev/null +++ b/dedup/debpkg.py @@ -0,0 +1,55 @@ +from debian import deb822 + +from dedup.hashing import hash_file + +def process_control(control_contents): + """Parses the contents of a control file from a control.tar.gz of a Debian + package and returns a dictionary containing the fields relevant to dedup. + @type control_contents: bytes + @rtype: {str: object} + """ + control = deb822.Packages(control_contents) + package = control["package"].encode("ascii") + try: + source = control["source"].encode("ascii").split()[0] + except KeyError: + source = package + version = control["version"].encode("ascii") + architecture = control["architecture"].encode("ascii") + + depends = set(dep[0]["name"].encode("ascii") + for dep in control.relations.get("depends", ()) + if len(dep) == 1) + return dict(package=package, source=source, version=version, + architecture=architecture, depends=depends) + +class MultiHash(object): + def __init__(self, *hashes): + self.hashes = hashes + + def update(self, data): + for hasher in self.hashes: + hasher.update(data) + +def get_tar_hashes(tar, hash_functions): + """Given a TarFile read all regular files and compute all of the given hash + functions on each file. + @type tar: tarfile.TarFile + @param hash_functions: a sequence of parameter-less functions each creating a + new hashlib-like object + @rtype: gen((str, int, {str: str}} + @returns: an iterable of (filename, filesize, hashes) tuples where + hashes is a dict mapping hash function names to hash values + """ + + for elem in tar: + if not elem.isreg(): # excludes hard links as well + continue + hasher = MultiHash(*[func() for func in hash_functions]) + hasher = hash_file(hasher, tar.extractfile(elem)) + hashes = {} + for hashobj in hasher.hashes: + hashvalue = hashobj.hexdigest() + if hashvalue: + hashes[hashobj.name] = hashvalue + yield (elem.name, elem.size, hashes) diff --git a/importpkg.py b/importpkg.py index 1334dd6..54f6181 100755 --- a/importpkg.py +++ b/importpkg.py @@ -11,24 +11,16 @@ import sys import tarfile import zlib -from debian import deb822 import lzma import yaml from dedup.arreader import ArReader +from dedup.debpkg import process_control, get_tar_hashes from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \ - HashedStream, hash_file + HashedStream from dedup.compression import GzipDecompressor, DecompressedStream from dedup.image import GIFHash, PNGHash -class MultiHash(object): - def __init__(self, *hashes): - self.hashes = hashes - - def update(self, data): - for hasher in self.hashes: - hasher.update(data) - boring_sha512_hashes = set(( # "" "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e", @@ -57,37 +49,7 @@ def gifhash(): hashobj.name = "gif_sha512" return hashobj -def get_hashes(tar): - for elem in tar: - if not elem.isreg(): # excludes hard links as well - continue - hasher = MultiHash(sha512_nontrivial(), gziphash(), pnghash(), - gifhash()) - hasher = hash_file(hasher, tar.extractfile(elem)) - hashes = {} - for hashobj in hasher.hashes: - hashvalue = hashobj.hexdigest() - if hashvalue: - hashes[hashobj.name] = hashvalue - yield (elem.name, elem.size, hashes) - -def process_control(control_contents): - control = deb822.Packages(control_contents) - package = control["package"].encode("ascii") - try: - source = control["source"].encode("ascii").split()[0] - except KeyError: - source = package - version = control["version"].encode("ascii") - architecture = control["architecture"].encode("ascii") - - depends = set(dep[0]["name"].encode("ascii") - for dep in control.relations.get("depends", ()) - if len(dep) == 1) - return dict(package=package, source=source, version=version, - architecture=architecture, depends=depends) - -def process_package(filelike): +def process_package(filelike, hash_functions): af = ArReader(filelike) af.read_magic() state = "start" @@ -123,7 +85,7 @@ def process_package(filelike): continue if state != "control_file": raise ValueError("missing control file") - for name, size, hashes in get_hashes(tf): + for name, size, hashes in get_tar_hashes(tf, hash_functions): try: name = name.decode("utf8") except UnicodeDecodeError: @@ -133,9 +95,9 @@ def process_package(filelike): yield "commit" break -def process_package_with_hash(filelike, sha256hash): +def process_package_with_hash(filelike, hash_functions, sha256hash): hstream = HashedStream(filelike, hashlib.sha256()) - for elem in process_package(hstream): + for elem in process_package(hstream, hash_functions): if elem == "commit": while hstream.read(4096): pass @@ -150,10 +112,11 @@ def main(): parser.add_option("-H", "--hash", action="store", help="verify that stdin hash given sha256 hash") options, args = parser.parse_args() + hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash] if options.hash: - gen = process_package_with_hash(sys.stdin, options.hash) + gen = process_package_with_hash(sys.stdin, hash_functions, options.hash) else: - gen = process_package(sys.stdin) + gen = process_package(sys.stdin, hash_functions) yaml.safe_dump_all(gen, sys.stdout) if __name__ == "__main__": |