summaryrefslogtreecommitdiff
path: root/importpkg.py
diff options
context:
space:
mode:
authorHelmut Grohne <helmut@subdivi.de>2014-03-08 12:39:32 +0100
committerHelmut Grohne <helmut@subdivi.de>2014-03-08 12:39:32 +0100
commitc6a30cefff55cd247a47fa0a2d4f819592e1202b (patch)
tree58b6ff52bc6827782c2973f1ce976e245ce5f34c /importpkg.py
parent751f19ec1107c9059ae4834e4b757741ebee6cbd (diff)
parentbb0aea9971bc79d8787d8f034022d0ca803fcab3 (diff)
downloaddebian-dedup-c6a30cefff55cd247a47fa0a2d4f819592e1202b.tar.gz
Merge branch 'master' into sqlalchemy
In the meantime, the master branch evolved quite a bit and the schema changed again (eqclass added to function table). The main reason for the merge is to resolve the large amounts of conflicts once, so development of the sqlalchemy branch can continue and still benefit from changes in the master branch, such as schema compatibility and the adapted indent level in the web app due to the use of contextlib.closing, which resembles sqlalchemy's "with db.begin() as conn:". Conflicts: autoimport.py dedup/utils.py readyaml.py update_sharing.py webapp.py
Diffstat (limited to 'importpkg.py')
-rwxr-xr-ximportpkg.py71
1 files changed, 15 insertions, 56 deletions
diff --git a/importpkg.py b/importpkg.py
index 182ca01..aeccda5 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -1,7 +1,7 @@
#!/usr/bin/python
-"""This tool reads a debian package from stdin and emits a yaml stream on
+"""This tool reads a Debian package from stdin and emits a yaml stream on
stdout. It does not access a database. Therefore it can be run in parallel and
-on multiple machines. The generated yaml conatins multiple documents. The first
+on multiple machines. The generated yaml contains multiple documents. The first
document contains package metadata. Then a document is emitted for each file.
And finally a document consisting of the string "commit" is emitted."""
@@ -11,38 +11,26 @@ import sys
import tarfile
import zlib
-from debian import deb822
import lzma
import yaml
from dedup.arreader import ArReader
-from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \
- HashedStream, hash_file
+from dedup.debpkg import process_control, get_tar_hashes
+from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
+ HashBlacklistContent
from dedup.compression import GzipDecompressor, DecompressedStream
from dedup.image import GIFHash, PNGHash
-class MultiHash(object):
- def __init__(self, *hashes):
- self.hashes = hashes
-
- def update(self, data):
- for hasher in self.hashes:
- hasher.update(data)
-
-boring_sha512_hashes = set((
- # ""
- "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
- # "\n"
- "be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09"))
+boring_content = set(("", "\n"))
def sha512_nontrivial():
- return HashBlacklist(hashlib.sha512(), boring_sha512_hashes)
+ return HashBlacklistContent(hashlib.sha512(), boring_content)
def gziphash():
hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512())
hashobj = SuppressingHash(hashobj, (ValueError, zlib.error))
hashobj.name = "gzip_sha512"
- return HashBlacklist(hashobj, boring_sha512_hashes)
+ return HashBlacklistContent(hashobj, boring_content)
def pnghash():
hashobj = PNGHash(hashlib.sha512())
@@ -56,37 +44,7 @@ def gifhash():
hashobj.name = "gif_sha512"
return hashobj
-def get_hashes(tar):
- for elem in tar:
- if not elem.isreg(): # excludes hard links as well
- continue
- hasher = MultiHash(sha512_nontrivial(), gziphash(), pnghash(),
- gifhash())
- hasher = hash_file(hasher, tar.extractfile(elem))
- hashes = {}
- for hashobj in hasher.hashes:
- hashvalue = hashobj.hexdigest()
- if hashvalue:
- hashes[hashobj.name] = hashvalue
- yield (elem.name, elem.size, hashes)
-
-def process_control(control_contents):
- control = deb822.Packages(control_contents)
- package = control["package"].encode("ascii")
- try:
- source = control["source"].encode("ascii").split()[0]
- except KeyError:
- source = package
- version = control["version"].encode("ascii")
- architecture = control["architecture"].encode("ascii")
-
- depends = set(dep[0]["name"].encode("ascii")
- for dep in control.relations.get("depends", ())
- if len(dep) == 1)
- return dict(package=package, source=source, version=version,
- architecture=architecture, depends=depends)
-
-def process_package(filelike):
+def process_package(filelike, hash_functions):
af = ArReader(filelike)
af.read_magic()
state = "start"
@@ -122,7 +80,7 @@ def process_package(filelike):
continue
if state != "control_file":
raise ValueError("missing control file")
- for name, size, hashes in get_hashes(tf):
+ for name, size, hashes in get_tar_hashes(tf, hash_functions):
try:
name = name.decode("utf8")
except UnicodeDecodeError:
@@ -132,9 +90,9 @@ def process_package(filelike):
yield "commit"
break
-def process_package_with_hash(filelike, sha256hash):
+def process_package_with_hash(filelike, hash_functions, sha256hash):
hstream = HashedStream(filelike, hashlib.sha256())
- for elem in process_package(hstream):
+ for elem in process_package(hstream, hash_functions):
if elem == "commit":
while hstream.read(4096):
pass
@@ -149,10 +107,11 @@ def main():
parser.add_option("-H", "--hash", action="store",
help="verify that stdin hash given sha256 hash")
options, args = parser.parse_args()
+ hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash]
if options.hash:
- gen = process_package_with_hash(sys.stdin, options.hash)
+ gen = process_package_with_hash(sys.stdin, hash_functions, options.hash)
else:
- gen = process_package(sys.stdin)
+ gen = process_package(sys.stdin, hash_functions)
yaml.safe_dump_all(gen, sys.stdout)
if __name__ == "__main__":