summary refs log tree commit diff
diff options
context:
space:
mode:
-rwxr-xr-x  importpkg.py  71
-rwxr-xr-x  readyaml.py   27
-rw-r--r--  schema.sql    72
3 files changed, 45 insertions, 125 deletions
diff --git a/importpkg.py b/importpkg.py
index 7e074e1..bef0be0 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -5,6 +5,7 @@ on multiple machines. The generated yaml contains multiple documents. The first
document contains package metadata. Then a document is emitted for each file.
And finally a document consisting of the string "commit" is emitted."""
+import binascii
import hashlib
import optparse
import sys
@@ -23,27 +24,6 @@ from dedup.image import GIFHash, PNGHash
boring_content = set(("", "\n"))
-def sha512_nontrivial():
- return HashBlacklistContent(hashlib.sha512(), boring_content)
-
-def gziphash():
- hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512())
- hashobj = SuppressingHash(hashobj, (ValueError, zlib.error))
- hashobj.name = "gzip_sha512"
- return HashBlacklistContent(hashobj, boring_content)
-
-def pnghash():
- hashobj = PNGHash(hashlib.sha512())
- hashobj = SuppressingHash(hashobj, (ValueError,))
- hashobj.name = "png_sha512"
- return hashobj
-
-def gifhash():
- hashobj = GIFHash(hashlib.sha512())
- hashobj = SuppressingHash(hashobj, (ValueError,))
- hashobj.name = "gif_sha512"
- return hashobj
-
def decompress_tar(filelike, extension):
if extension in (".lzma", ".xz"):
filelike = DecompressedStream(filelike, lzma.LZMADecompressor())
@@ -53,47 +33,45 @@ def decompress_tar(filelike, extension):
extension)
return tarfile.open(fileobj=filelike, mode="r|" + extension[1:])
-def process_package(filelike, hash_functions):
+def process_package(filelike):
af = ArReader(filelike)
af.read_magic()
state = "start"
+ record = None
while True:
try:
name = af.read_entry()
except EOFError:
- raise ValueError("data.tar not found")
+ raise ValueError("control.tar not found")
if name.startswith("control.tar"):
if state != "start":
raise ValueError("unexpected control.tar")
state = "control"
tf = decompress_tar(af, name[11:])
+ controldata = {}
for elem in tf:
- if elem.name not in ("./control", "control"):
+ if not elem.isreg():
continue
- if state != "control":
- raise ValueError("duplicate control file")
- state = "control_file"
- yield process_control(tf.extractfile(elem).read())
- break
- continue
- elif name.startswith("data.tar"):
- if state != "control_file":
- raise ValueError("missing control file")
- state = "data"
- tf = decompress_tar(af, name[8:])
- for name, size, hashes in get_tar_hashes(tf, hash_functions):
- try:
- name = name.decode("utf8")
- except UnicodeDecodeError:
- print("warning: skipping filename with encoding error")
- continue # skip files with non-utf8 encoding for now
- yield dict(name=name, size=size, hashes=hashes)
+ elemname = elem.name
+ if elemname.startswith("./"):
+ elemname = elemname[2:]
+ content = tf.extractfile(elem).read()
+ if elemname in controldata:
+ raise ValueError("duplicate entry %r in control.tar" %
+ elemname)
+ controldata[elemname] = binascii.b2a_base64(content).strip()
+ if elemname == "control":
+ record = process_control(content)
+ if record is None:
+ raise ValueError("control file missing from control.tar")
+ record["data"] = controldata
+ yield record
yield "commit"
break
-def process_package_with_hash(filelike, hash_functions, sha256hash):
+def process_package_with_hash(filelike, sha256hash):
hstream = HashedStream(filelike, hashlib.sha256())
- for elem in process_package(hstream, hash_functions):
+ for elem in process_package(hstream):
if elem == "commit":
while hstream.read(4096):
pass
@@ -108,11 +86,10 @@ def main():
parser.add_option("-H", "--hash", action="store",
help="verify that stdin hash given sha256 hash")
options, args = parser.parse_args()
- hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash]
if options.hash:
- gen = process_package_with_hash(sys.stdin, hash_functions, options.hash)
+ gen = process_package_with_hash(sys.stdin, options.hash)
else:
- gen = process_package(sys.stdin, hash_functions)
+ gen = process_package(sys.stdin)
yaml.safe_dump_all(gen, sys.stdout)
if __name__ == "__main__":
diff --git a/readyaml.py b/readyaml.py
index 2ef9a3b..7b75f2c 100755
--- a/readyaml.py
+++ b/readyaml.py
@@ -2,6 +2,7 @@
"""This tool reads a yaml file as generated by importpkg.py on stdin and
updates the database with the contents."""
+import binascii
import optparse
import sqlite3
import sys
@@ -26,13 +27,11 @@ def readyaml(db, stream):
pid = None
cur.execute("BEGIN;")
- cur.execute("SELECT name, id FROM function;")
- funcmapping = dict(cur.fetchall())
if pid is not None:
- cur.execute("DELETE FROM content WHERE pid = ?;", (pid,))
cur.execute("DELETE FROM dependency WHERE pid = ?;", (pid,))
cur.execute("UPDATE package SET version = ?, architecture = ?, source = ? WHERE id = ?;",
(metadata["version"], metadata["architecture"], metadata["source"], pid))
+ cur.execute("DELETE FROM control WHERE pid = ?;", (pid,))
else:
cur.execute("INSERT INTO package (name, version, architecture, source) VALUES (?, ?, ?, ?);",
(package, metadata["version"], metadata["architecture"],
@@ -40,18 +39,16 @@ def readyaml(db, stream):
pid = cur.lastrowid
cur.executemany("INSERT INTO dependency (pid, required) VALUES (?, ?);",
((pid, dep) for dep in metadata["depends"]))
- for entry in gen:
- if entry == "commit":
- db.commit()
- return
-
- cur.execute("INSERT INTO content (pid, filename, size) VALUES (?, ?, ?);",
- (pid, entry["name"], entry["size"]))
- cid = cur.lastrowid
- cur.executemany("INSERT INTO hash (cid, fid, hash) VALUES (?, ?, ?);",
- ((cid, funcmapping[func], hexhash)
- for func, hexhash in entry["hashes"].items()))
- raise ValueError("missing commit block")
+ for name, content in metadata["data"].items():
+ content = sqlite3.Binary(binascii.a2b_base64(content))
+ cur.execute("INSERT INTO controlcontent (content) VALUES (?);",
+ (content,))
+ docid = cur.lastrowid
+ cur.execute("INSERT INTO control (pid, name, cid) VALUES (?, ?, ?);",
+ (pid, name, docid))
+ commit = next(gen)
+ if commit != "commit":
+ raise ValueError("missing commit block")
def main():
parser = optparse.OptionParser()
diff --git a/schema.sql b/schema.sql
index 99ae7e5..b23b8e6 100644
--- a/schema.sql
+++ b/schema.sql
@@ -6,74 +6,20 @@ CREATE TABLE package ( -- binary Debian packages
architecture TEXT,
source TEXT); -- name of the source package it was built from
-CREATE TABLE content ( -- a file contained in a binary package
- id INTEGER PRIMARY KEY,
- pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
- -- which package the file is contained in
- filename TEXT NOT NULL,
- size INTEGER NOT NULL);
-
-CREATE TABLE function ( -- hash functions
- id INTEGER PRIMARY KEY,
- name TEXT UNIQUE NOT NULL,
- eqclass INTEGER);
- -- hash values of different hash functions are comparable if they share
- -- an eqclass
-
-INSERT INTO function (id, name, eqclass) VALUES
- (1, 'sha512', 1),
- (2, 'gzip_sha512', 1),
- -- decompress a gzip file, then hash
- (3, 'png_sha512', 2),
- -- decompress a PNG file, hash RGBA image contents + dimension
- (4, 'gif_sha512', 2);
- -- decompress a GIF file, hash RGBA image contents + dimension
-
-CREATE TABLE hash ( -- hash values of files in binary packages
- cid INTEGER NOT NULL REFERENCES content(id) ON DELETE CASCADE,
- -- which file has been hashed
- fid INTEGER NOT NULL REFERENCES function(id),
- -- using which function
- hash TEXT NOT NULL); -- textual hash value
-
CREATE TABLE dependency ( -- binary package dependencies
pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
-- the package that carries a Depends: header
required TEXT NOT NULL);
-- the name of a package that is depended upon without alternative
-CREATE INDEX content_package_size_index ON content (pid, size);
-CREATE INDEX hash_cid_index ON hash (cid);
-CREATE INDEX hash_hash_index ON hash (hash);
-
--- All tables below this line can be recomputed from the tables above.
--- Recomputation is done using the update_sharing.py script.
+CREATE VIRTUAL TABLE controlcontent USING fts4(content);
--- The sharing table caches two values for each pair of packages pid1, pid2 and
--- pair of hash functions fid1, fid2:
--- * files is the number of files in pid1 that could be eliminated by reusing
--- files from pid2. Since the functions may be different, this may mean
--- replacing a compressed file with an uncompressed one.
--- * size is the number of bytes that would be freed by doing the above.
--- Note: If pid1=pid2, one copy of each file must be preserved.
-CREATE TABLE sharing (
- pid1 INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
- pid2 INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
- fid1 INTEGER NOT NULL REFERENCES function(id),
- fid2 INTEGER NOT NULL REFERENCES function(id),
- files INTEGER NOT NULL,
- size INTEGER NOT NULL);
-CREATE INDEX sharing_insert_index ON sharing (pid1, pid2, fid1, fid2);
-
--- The duplicate table caches all files that have any non-unique hash value.
--- It is used in webapp.py to speed up one query, but could be dropped
--- otherwise.
-CREATE TABLE duplicate (
- cid INTEGER PRIMARY KEY REFERENCES content(id) ON DELETE CASCADE);
+CREATE TABLE control ( -- control.tar contents
+ pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
+ -- the package that contains a control.tar member
+ name TEXT NOT NULL,
+ -- the name of the control.tar member without leading "./"
+ cid INTEGER NOT NULL);
+ -- a reference to the binary contents of the file
--- The issue table contains auxillary notices per file. For example, when a
--- filename ends in ".gz", but is not gzip decompressible (has no gzip_sha512
--- hash), it is recorded here.
-CREATE TABLE issue (
- cid INTEGER NOT NULL REFERENCES content(id) ON DELETE CASCADE,
- issue TEXT NOT NULL); -- a human readable comment on the file
+CREATE INDEX control_name_index ON control(name);