summaryrefslogtreecommitdiff
path: root/importpkg.py
diff options
context:
space:
mode:
author    Helmut Grohne <helmut@subdivi.de>  2014-12-12 13:28:02 +0100
committer Helmut Grohne <helmut@subdivi.de>  2014-12-12 13:28:02 +0100
commit    36fd6fc4bd1c6930c77aa5b6408a832c1e651ef6 (patch)
tree      b1a9fcedb51aee454f42e5d1afa8b94db0ccafea /importpkg.py
parent    2a728ab85e1ddfeec03514f86f706c116ca94440 (diff)
download  debian-dedup-36fd6fc4bd1c6930c77aa5b6408a832c1e651ef6.tar.gz
full text searching on control.tar members (branch: controldata)
This is a rather strange variant that has nothing to do with deduplication anymore. Instead, it enables searching a sqlite fts4 table containing all members of control.tars.
Diffstat (limited to 'importpkg.py')
-rwxr-xr-x  importpkg.py  71
1 file changed, 24 insertions(+), 47 deletions(-)
diff --git a/importpkg.py b/importpkg.py
index 7e074e1..bef0be0 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -5,6 +5,7 @@ on multiple machines. The generated yaml contains multiple documents. The first
document contains package metadata. Then a document is emitted for each file.
And finally a document consisting of the string "commit" is emitted."""
+import binascii
import hashlib
import optparse
import sys
@@ -23,27 +24,6 @@ from dedup.image import GIFHash, PNGHash
boring_content = set(("", "\n"))
-def sha512_nontrivial():
- return HashBlacklistContent(hashlib.sha512(), boring_content)
-
-def gziphash():
- hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512())
- hashobj = SuppressingHash(hashobj, (ValueError, zlib.error))
- hashobj.name = "gzip_sha512"
- return HashBlacklistContent(hashobj, boring_content)
-
-def pnghash():
- hashobj = PNGHash(hashlib.sha512())
- hashobj = SuppressingHash(hashobj, (ValueError,))
- hashobj.name = "png_sha512"
- return hashobj
-
-def gifhash():
- hashobj = GIFHash(hashlib.sha512())
- hashobj = SuppressingHash(hashobj, (ValueError,))
- hashobj.name = "gif_sha512"
- return hashobj
-
def decompress_tar(filelike, extension):
if extension in (".lzma", ".xz"):
filelike = DecompressedStream(filelike, lzma.LZMADecompressor())
@@ -53,47 +33,45 @@ def decompress_tar(filelike, extension):
extension)
return tarfile.open(fileobj=filelike, mode="r|" + extension[1:])
-def process_package(filelike, hash_functions):
+def process_package(filelike):
af = ArReader(filelike)
af.read_magic()
state = "start"
+ record = None
while True:
try:
name = af.read_entry()
except EOFError:
- raise ValueError("data.tar not found")
+ raise ValueError("control.tar not found")
if name.startswith("control.tar"):
if state != "start":
raise ValueError("unexpected control.tar")
state = "control"
tf = decompress_tar(af, name[11:])
+ controldata = {}
for elem in tf:
- if elem.name not in ("./control", "control"):
+ if not elem.isreg():
continue
- if state != "control":
- raise ValueError("duplicate control file")
- state = "control_file"
- yield process_control(tf.extractfile(elem).read())
- break
- continue
- elif name.startswith("data.tar"):
- if state != "control_file":
- raise ValueError("missing control file")
- state = "data"
- tf = decompress_tar(af, name[8:])
- for name, size, hashes in get_tar_hashes(tf, hash_functions):
- try:
- name = name.decode("utf8")
- except UnicodeDecodeError:
- print("warning: skipping filename with encoding error")
- continue # skip files with non-utf8 encoding for now
- yield dict(name=name, size=size, hashes=hashes)
+ elemname = elem.name
+ if elemname.startswith("./"):
+ elemname = elemname[2:]
+ content = tf.extractfile(elem).read()
+ if elemname in controldata:
+ raise ValueError("duplicate entry %r in control.tar" %
+ elemname)
+ controldata[elemname] = binascii.b2a_base64(content).strip()
+ if elemname == "control":
+ record = process_control(content)
+ if record is None:
+ raise ValueError("control file missing from control.tar")
+ record["data"] = controldata
+ yield record
yield "commit"
break
-def process_package_with_hash(filelike, hash_functions, sha256hash):
+def process_package_with_hash(filelike, sha256hash):
hstream = HashedStream(filelike, hashlib.sha256())
- for elem in process_package(hstream, hash_functions):
+ for elem in process_package(hstream):
if elem == "commit":
while hstream.read(4096):
pass
@@ -108,11 +86,10 @@ def main():
parser.add_option("-H", "--hash", action="store",
help="verify that stdin hash given sha256 hash")
options, args = parser.parse_args()
- hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash]
if options.hash:
- gen = process_package_with_hash(sys.stdin, hash_functions, options.hash)
+ gen = process_package_with_hash(sys.stdin, options.hash)
else:
- gen = process_package(sys.stdin, hash_functions)
+ gen = process_package(sys.stdin)
yaml.safe_dump_all(gen, sys.stdout)
if __name__ == "__main__":