diff options
author | Helmut Grohne <helmut@subdivi.de> | 2014-03-08 12:39:32 +0100 |
---|---|---|
committer | Helmut Grohne <helmut@subdivi.de> | 2014-03-08 12:39:32 +0100 |
commit | c6a30cefff55cd247a47fa0a2d4f819592e1202b (patch) | |
tree | 58b6ff52bc6827782c2973f1ce976e245ce5f34c /dedup/debpkg.py | |
parent | 751f19ec1107c9059ae4834e4b757741ebee6cbd (diff) | |
parent | bb0aea9971bc79d8787d8f034022d0ca803fcab3 (diff) | |
download | debian-dedup-c6a30cefff55cd247a47fa0a2d4f819592e1202b.tar.gz |
Merge branch 'master' into sqlalchemy
In the meantime, the master branch evolved quite a bit and the schema
changed again (eqclass added to function table). The main reason for the
merge is to resolve the large number of conflicts once, so development
of the sqlalchemy branch can continue and still benefit from changes in
the master branch, such as schema compatibility and the adapted indent
level in the web app due to the use of contextlib.closing, which resembles
sqlalchemy's "with db.begin() as conn:".
Conflicts:
autoimport.py
dedup/utils.py
readyaml.py
update_sharing.py
webapp.py
Diffstat (limited to 'dedup/debpkg.py')
-rw-r--r-- | dedup/debpkg.py | 55 |
1 files changed, 55 insertions, 0 deletions
# dedup/debpkg.py -- added as a new file by this commit (blob 2d67135).
from debian import deb822

from dedup.hashing import hash_file


def process_control(control_contents):
    """Extract the fields relevant to dedup from a Debian control file.

    The input is the content of the control file shipped in the
    control.tar.gz of a Debian package.

    @type control_contents: bytes
    @rtype: {str: object}
    @returns: a dict with the keys "package", "source", "version" and
        "architecture" (each ASCII-encoded) and "depends" (a set of
        ASCII-encoded package names)
    """
    control = deb822.Packages(control_contents)
    package = control["package"].encode("ascii")

    try:
        raw_source = control["source"]
    except KeyError:
        # No Source field: fall back to the binary package name.
        source = package
    else:
        # Only the first whitespace-separated word is kept (presumably to
        # drop a trailing version qualifier -- confirm against callers).
        source = raw_source.encode("ascii").split()[0]

    version = control["version"].encode("ascii")
    architecture = control["architecture"].encode("ascii")

    depends = set()
    for relation in control.relations.get("depends", ()):
        # Entries with more than one element (presumably alternatives such
        # as "a | b") are ignored.
        if len(relation) != 1:
            continue
        # deb822 currently returns :any dependencies raw. see #670679
        # Strip everything from the first ':' on to get the bare name.
        bare_name = relation[0]["name"].split(u':', 1)[0]
        depends.add(bare_name.encode("ascii"))

    return {
        "package": package,
        "source": source,
        "version": version,
        "architecture": architecture,
        "depends": depends,
    }


class MultiHash(object):
    """Feed the same data to several hashlib-like objects at once.

    The wrapped objects stay accessible through the C{hashes} attribute so
    that callers can read the individual results afterwards.
    """

    def __init__(self, *hashes):
        # Tuple of the hash objects, in the order they were passed in.
        self.hashes = hashes

    def update(self, data):
        """Forward data to the update method of every wrapped hash object."""
        for hashobj in self.hashes:
            hashobj.update(data)


def get_tar_hashes(tar, hash_functions):
    """Hash every regular file of a TarFile with all given hash functions.

    @type tar: tarfile.TarFile
    @param hash_functions: a sequence of parameter-less functions each
        creating a new hashlib-like object
    @rtype: gen((str, int, {str: str}))
    @returns: an iterable of (filename, filesize, hashes) tuples where
        hashes is a dict mapping hash function names to hash values; hash
        objects whose hexdigest() is empty or None are left out
    """
    for member in tar:
        if member.isreg():  # excludes hard links as well
            # Fresh hash objects for every member, fed in a single pass.
            multi = MultiHash(*[create() for create in hash_functions])
            multi = hash_file(multi, tar.extractfile(member))
            # Lazily pair each hash object with its digest so that .name is
            # only looked up for objects that produced a value.
            digests = ((hashobj, hashobj.hexdigest())
                       for hashobj in multi.hashes)
            hashes = dict((hashobj.name, value)
                          for hashobj, value in digests if value)
            yield (member.name, member.size, hashes)