From 36fd6fc4bd1c6930c77aa5b6408a832c1e651ef6 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Fri, 12 Dec 2014 13:28:02 +0100
Subject: full text searching on control.tar members

This is a rather strange variant that has nothing to do with deduplication
anymore. Instead, it enables searching a sqlite fts4 table containing all
members of control.tars.
---
 importpkg.py | 71 ++++++++++++++++++++---------------------------------------
 readyaml.py  | 27 ++++++++++-------------
 schema.sql   | 72 ++++++++----------------------------------------------------
 3 files changed, 45 insertions(+), 125 deletions(-)

diff --git a/importpkg.py b/importpkg.py
index 7e074e1..bef0be0 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -5,6 +5,7 @@ on multiple machines. The generated yaml contains multiple documents. The first
 document contains package metadata. Then a document is emitted for each file.
 And finally a document consisting of the string "commit" is emitted."""
 
+import binascii
 import hashlib
 import optparse
 import sys
@@ -23,27 +24,6 @@ from dedup.image import GIFHash, PNGHash
 
 boring_content = set(("", "\n"))
 
-def sha512_nontrivial():
-    return HashBlacklistContent(hashlib.sha512(), boring_content)
-
-def gziphash():
-    hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512())
-    hashobj = SuppressingHash(hashobj, (ValueError, zlib.error))
-    hashobj.name = "gzip_sha512"
-    return HashBlacklistContent(hashobj, boring_content)
-
-def pnghash():
-    hashobj = PNGHash(hashlib.sha512())
-    hashobj = SuppressingHash(hashobj, (ValueError,))
-    hashobj.name = "png_sha512"
-    return hashobj
-
-def gifhash():
-    hashobj = GIFHash(hashlib.sha512())
-    hashobj = SuppressingHash(hashobj, (ValueError,))
-    hashobj.name = "gif_sha512"
-    return hashobj
-
 def decompress_tar(filelike, extension):
     if extension in (".lzma", ".xz"):
         filelike = DecompressedStream(filelike, lzma.LZMADecompressor())
@@ -53,47 +33,45 @@ def decompress_tar(filelike, extension):
                          extension)
     return tarfile.open(fileobj=filelike, mode="r|" + extension[1:])
 
-def process_package(filelike, hash_functions):
+def process_package(filelike):
     af = ArReader(filelike)
     af.read_magic()
     state = "start"
+    record = None
     while True:
         try:
             name = af.read_entry()
         except EOFError:
-            raise ValueError("data.tar not found")
+            raise ValueError("control.tar not found")
         if name.startswith("control.tar"):
             if state != "start":
                 raise ValueError("unexpected control.tar")
             state = "control"
             tf = decompress_tar(af, name[11:])
+            controldata = {}
             for elem in tf:
-                if elem.name not in ("./control", "control"):
+                if not elem.isreg():
                     continue
-                if state != "control":
-                    raise ValueError("duplicate control file")
-                state = "control_file"
-                yield process_control(tf.extractfile(elem).read())
-                break
-            continue
-        elif name.startswith("data.tar"):
-            if state != "control_file":
-                raise ValueError("missing control file")
-            state = "data"
-            tf = decompress_tar(af, name[8:])
-            for name, size, hashes in get_tar_hashes(tf, hash_functions):
-                try:
-                    name = name.decode("utf8")
-                except UnicodeDecodeError:
-                    print("warning: skipping filename with encoding error")
-                    continue # skip files with non-utf8 encoding for now
-                yield dict(name=name, size=size, hashes=hashes)
+                elemname = elem.name
+                if elemname.startswith("./"):
+                    elemname = elemname[2:]
+                content = tf.extractfile(elem).read()
+                if elemname in controldata:
+                    raise ValueError("duplicate entry %r in control.tar" %
+                                     elemname)
+                controldata[elemname] = binascii.b2a_base64(content).strip()
+                if elemname == "control":
+                    record = process_control(content)
+            if record is None:
+                raise ValueError("control file missing from control.tar")
+            record["data"] = controldata
+            yield record
             yield "commit"
             break
 
-def process_package_with_hash(filelike, hash_functions, sha256hash):
+def process_package_with_hash(filelike, sha256hash):
     hstream = HashedStream(filelike, hashlib.sha256())
-    for elem in process_package(hstream, hash_functions):
+    for elem in process_package(hstream):
         if elem == "commit":
             while hstream.read(4096):
                 pass
@@ -108,11 +86,10 @@ def main():
     parser.add_option("-H", "--hash", action="store",
                       help="verify that stdin hash given sha256 hash")
     options, args = parser.parse_args()
-    hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash]
     if options.hash:
-        gen = process_package_with_hash(sys.stdin, hash_functions, options.hash)
+        gen = process_package_with_hash(sys.stdin, options.hash)
     else:
-        gen = process_package(sys.stdin, hash_functions)
+        gen = process_package(sys.stdin)
     yaml.safe_dump_all(gen, sys.stdout)
 
 if __name__ == "__main__":
diff --git a/readyaml.py b/readyaml.py
index 2ef9a3b..7b75f2c 100755
--- a/readyaml.py
+++ b/readyaml.py
@@ -2,6 +2,7 @@
 """This tool reads a yaml file as generated by importpkg.py on stdin and
 updates the database with the contents."""
 
+import binascii
 import optparse
 import sqlite3
 import sys
@@ -26,13 +27,11 @@ def readyaml(db, stream):
         pid = None
 
     cur.execute("BEGIN;")
-    cur.execute("SELECT name, id FROM function;")
-    funcmapping = dict(cur.fetchall())
     if pid is not None:
-        cur.execute("DELETE FROM content WHERE pid = ?;", (pid,))
         cur.execute("DELETE FROM dependency WHERE pid = ?;", (pid,))
         cur.execute("UPDATE package SET version = ?, architecture = ?, source = ? WHERE id = ?;",
                     (metadata["version"], metadata["architecture"], metadata["source"], pid))
+        cur.execute("DELETE FROM control WHERE pid = ?;", (pid,))
     else:
         cur.execute("INSERT INTO package (name, version, architecture, source) VALUES (?, ?, ?, ?);",
                     (package, metadata["version"], metadata["architecture"],
@@ -40,18 +39,16 @@ def readyaml(db, stream):
         pid = cur.lastrowid
     cur.executemany("INSERT INTO dependency (pid, required) VALUES (?, ?);",
                     ((pid, dep) for dep in metadata["depends"]))
-    for entry in gen:
-        if entry == "commit":
-            db.commit()
-            return
-
-        cur.execute("INSERT INTO content (pid, filename, size) VALUES (?, ?, ?);",
-                    (pid, entry["name"], entry["size"]))
-        cid = cur.lastrowid
-        cur.executemany("INSERT INTO hash (cid, fid, hash) VALUES (?, ?, ?);",
-                        ((cid, funcmapping[func], hexhash)
-                         for func, hexhash in entry["hashes"].items()))
-    raise ValueError("missing commit block")
+    for name, content in metadata["data"].items():
+        content = sqlite3.Binary(binascii.a2b_base64(content))
+        cur.execute("INSERT INTO controlcontent (content) VALUES (?);",
+                    (content,))
+        docid = cur.lastrowid
+        cur.execute("INSERT INTO control (pid, name, cid) VALUES (?, ?, ?);",
+                    (pid, name, docid))
+    commit = next(gen)
+    if commit != "commit":
+        raise ValueError("missing commit block")
 
 def main():
     parser = optparse.OptionParser()
diff --git a/schema.sql b/schema.sql
index 99ae7e5..b23b8e6 100644
--- a/schema.sql
+++ b/schema.sql
@@ -6,74 +6,20 @@ CREATE TABLE package ( -- binary Debian packages
 	architecture TEXT,
 	source TEXT); -- name of the source package it was built from
 
-CREATE TABLE content ( -- a file contained in a binary package
-	id INTEGER PRIMARY KEY,
-	pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
-		-- which package the file is contained in
-	filename TEXT NOT NULL,
-	size INTEGER NOT NULL);
-
-CREATE TABLE function ( -- hash functions
-	id INTEGER PRIMARY KEY,
-	name TEXT UNIQUE NOT NULL,
-	eqclass INTEGER);
-		-- hash values of different hash functions are comparable if they share
-		-- an eqclass
-
-INSERT INTO function (id, name, eqclass) VALUES
-	(1, 'sha512', 1),
-	(2, 'gzip_sha512', 1),
-		-- decompress a gzip file, then hash
-	(3, 'png_sha512', 2),
-		-- decompress a PNG file, hash RGBA image contents + dimension
-	(4, 'gif_sha512', 2);
-		-- decompress a GIF file, hash RGBA image contents + dimension
-
-CREATE TABLE hash ( -- hash values of files in binary packages
-	cid INTEGER NOT NULL REFERENCES content(id) ON DELETE CASCADE,
-		-- which file has been hashed
-	fid INTEGER NOT NULL REFERENCES function(id),
-		-- using which function
-	hash TEXT NOT NULL); -- textual hash value
-
 CREATE TABLE dependency ( -- binary package dependencies
 	pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
 		-- the package that carries a Depends: header
 	required TEXT NOT NULL);
 		-- the name of a package that is depended upon without alternative
 
-CREATE INDEX content_package_size_index ON content (pid, size);
-CREATE INDEX hash_cid_index ON hash (cid);
-CREATE INDEX hash_hash_index ON hash (hash);
-
--- All tables below this line can be recomputed from the tables above.
--- Recomputation is done using the update_sharing.py script.
+CREATE VIRTUAL TABLE controlcontent USING fts4(content);
 
--- The sharing table caches two values for each pair of packages pid1, pid2 and
--- pair of hash functions fid1, fid2:
--- * files is the number of files in pid1 that could be eliminated by reusing
---   files from pid2. Since the functions may be different, this may mean
---   replacing a compressed file with an uncompressed one.
--- * size is the number of bytes that would be freed by doing the above.
--- Note: If pid1=pid2, one copy of each file must be preserved.
-CREATE TABLE sharing (
-	pid1 INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
-	pid2 INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
-	fid1 INTEGER NOT NULL REFERENCES function(id),
-	fid2 INTEGER NOT NULL REFERENCES function(id),
-	files INTEGER NOT NULL,
-	size INTEGER NOT NULL);
-CREATE INDEX sharing_insert_index ON sharing (pid1, pid2, fid1, fid2);
-
--- The duplicate table caches all files that have any non-unique hash value.
--- It is used in webapp.py to speed up one query, but could be dropped
--- otherwise.
-CREATE TABLE duplicate (
-	cid INTEGER PRIMARY KEY REFERENCES content(id) ON DELETE CASCADE);
+CREATE TABLE control ( -- control.tar contents
+	pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
+		-- the package that contains a control.tar member
+	name TEXT NOT NULL,
+		-- the name of the control.tar member without leading "./"
+	cid INTEGER NOT NULL);
+		-- a reference to the binary contents of the file
 
--- The issue table contains auxillary notices per file. For example, when a
--- filename ends in ".gz", but is not gzip decompressible (has no gzip_sha512
--- hash), it is recorded here.
-CREATE TABLE issue (
-	cid INTEGER NOT NULL REFERENCES content(id) ON DELETE CASCADE,
-	issue TEXT NOT NULL); -- a human readable comment on the file
+CREATE INDEX control_name_index ON control(name);
-- 
cgit v1.2.3