summary refs log tree commit diff
path: root/importpkg.py
diff options
context:
space:
mode:
Diffstat (limited to 'importpkg.py')
-rwxr-xr-x importpkg.py 70
1 file changed, 33 insertions, 37 deletions
diff --git a/importpkg.py b/importpkg.py
index 54f6181..06d9da4 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -1,7 +1,7 @@
#!/usr/bin/python
-"""This tool reads a debian package from stdin and emits a yaml stream on
+"""This tool reads a Debian package from stdin and emits a yaml stream on
stdout. It does not access a database. Therefore it can be run in parallel and
-on multiple machines. The generated yaml conatins multiple documents. The first
+on multiple machines. The generated yaml contains multiple documents. The first
document contains package metadata. Then a document is emitted for each file.
And finally a document consisting of the string "commit" is emitted."""
@@ -16,26 +16,21 @@ import yaml
from dedup.arreader import ArReader
from dedup.debpkg import process_control, get_tar_hashes
-from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \
- HashedStream
+from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
+ HashBlacklistContent
from dedup.compression import GzipDecompressor, DecompressedStream
from dedup.image import GIFHash, PNGHash
-boring_sha512_hashes = set((
- # ""
- "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
- # "\n"
- "be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09"))
+boring_content = set(("", "\n"))
def sha512_nontrivial():
- return HashBlacklist(hashlib.sha512(), boring_sha512_hashes)
+ return HashBlacklistContent(hashlib.sha512(), boring_content)
def gziphash():
hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512())
hashobj = SuppressingHash(hashobj, (ValueError, zlib.error))
hashobj.name = "gzip_sha512"
- # don't blacklist boring hashes for gzip to get gzip issues right
- return hashobj
+ return HashBlacklistContent(hashobj, boring_content)
def pnghash():
hashobj = PNGHash(hashlib.sha512())
@@ -49,6 +44,15 @@ def gifhash():
hashobj.name = "gif_sha512"
return hashobj
+def decompress_tar(filelike, extension):
+ if extension in (".lzma", ".xz"):
+ filelike = DecompressedStream(filelike, lzma.LZMADecompressor())
+ extension = ""
+ if extension not in ("", ".gz", ".bz2"):
+ raise ValueError("unknown compression format with extension %r" %
+ extension)
+ return tarfile.open(fileobj=filelike, mode="r|" + extension[1:])
+
def process_package(filelike, hash_functions):
af = ArReader(filelike)
af.read_magic()
@@ -58,11 +62,11 @@ def process_package(filelike, hash_functions):
name = af.read_entry()
except EOFError:
raise ValueError("data.tar not found")
- if name == "control.tar.gz":
+ if name.startswith("control.tar"):
if state != "start":
- raise ValueError("unexpected control.tar.gz")
+ raise ValueError("unexpected control.tar")
state = "control"
- tf = tarfile.open(fileobj=af, mode="r|gz")
+ tf = decompress_tar(af, name[11:])
for elem in tf:
if elem.name != "./control":
continue
@@ -72,28 +76,20 @@ def process_package(filelike, hash_functions):
yield process_control(tf.extractfile(elem).read())
break
continue
- elif name == "data.tar.gz":
- tf = tarfile.open(fileobj=af, mode="r|gz")
- elif name == "data.tar.bz2":
- tf = tarfile.open(fileobj=af, mode="r|bz2")
- elif name == "data.tar.xz":
- zf = DecompressedStream(af, lzma.LZMADecompressor())
- tf = tarfile.open(fileobj=zf, mode="r|")
- elif name == "data.tar":
- tf = tarfile.open(fileobj=af, mode="r|")
- else:
- continue
- if state != "control_file":
- raise ValueError("missing control file")
- for name, size, hashes in get_tar_hashes(tf, hash_functions):
- try:
- name = name.decode("utf8")
- except UnicodeDecodeError:
- print("warning: skipping filename with encoding error")
- continue # skip files with non-utf8 encoding for now
- yield dict(name=name, size=size, hashes=hashes)
- yield "commit"
- break
+ elif name.startswith("data.tar"):
+ if state != "control_file":
+ raise ValueError("missing control file")
+ state = "data"
+ tf = decompress_tar(af, name[8:])
+ for name, size, hashes in get_tar_hashes(tf, hash_functions):
+ try:
+ name = name.decode("utf8")
+ except UnicodeDecodeError:
+ print("warning: skipping filename with encoding error")
+ continue # skip files with non-utf8 encoding for now
+ yield dict(name=name, size=size, hashes=hashes)
+ yield "commit"
+ break
def process_package_with_hash(filelike, hash_functions, sha256hash):
hstream = HashedStream(filelike, hashlib.sha256())