diff options
Diffstat (limited to 'schema.sql')
-rw-r--r-- | schema.sql | 81 |
1 files changed, 71 insertions, 10 deletions
@@ -1,20 +1,81 @@ -CREATE TABLE package (id INTEGER PRIMARY KEY, name TEXT, version TEXT, architecture TEXT, source TEXT, multiarch TEXT, UNIQUE (name, architecture)); -CREATE TABLE content (id INTEGER PRIMARY KEY, pid INTEGER, filename TEXT, size INTEGER, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE); -CREATE TABLE function (id INTEGER PRIMARY KEY, name TEXT UNIQUE NOT NULL, eqclass INTEGER); -INSERT INTO function (name, eqclass) VALUES ("sha512", 1), ("gzip_sha512", 1), ("png_sha512", 2), ("gif_sha512", 2); -CREATE TABLE hash (cid INTEGER, fid INTEGER NOT NULL, hash TEXT, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE, FOREIGN KEY (fid) REFERENCES function(id)); -CREATE TABLE dependency (pid INTEGER, required TEXT, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE); +CREATE TABLE package ( -- binary Debian packages + id INTEGER PRIMARY KEY, + name TEXT NOT NULL, + -- binary package name + version TEXT, -- Debian version + architecture TEXT, + source TEXT, -- name of the source package it was built from + multiarch TEXT, -- value of the Multi-Arch field + UNIQUE (name, architecture)); + +CREATE TABLE content ( -- a file contained in a binary package + id INTEGER PRIMARY KEY, + pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE, + -- which package the file is contained in + filename TEXT NOT NULL, + size INTEGER NOT NULL); + +CREATE TABLE function ( -- hash functions + id INTEGER PRIMARY KEY, + name TEXT UNIQUE NOT NULL, + eqclass INTEGER); + -- hash values of different hash functions are comparable if they share + -- an eqclass + +INSERT INTO function (id, name, eqclass) VALUES + (1, 'sha512', 1), + (2, 'gzip_sha512', 1), + -- decompress a gzip file, then hash + (3, 'png_sha512', 2), + -- decompress a PNG file, hash RGBA image contents + dimension + (4, 'gif_sha512', 2); + -- decompress a GIF file, hash RGBA image contents + dimension + +CREATE TABLE hash ( -- hash values of files in binary packages + cid INTEGER NOT NULL REFERENCES content(id) ON DELETE CASCADE, + -- which file has been hashed + fid INTEGER NOT NULL REFERENCES function(id), + -- using which function + hash TEXT NOT NULL); -- textual hash value + +CREATE TABLE dependency ( -- binary package dependencies + pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE, + -- the package that carries a Depends: header + required TEXT NOT NULL); + -- the name of a package that is depended upon without alternative + CREATE INDEX content_package_size_index ON content (pid, size); CREATE INDEX hash_cid_index ON hash (cid); CREATE INDEX hash_hash_index ON hash (hash); +-- All tables below this line can be recomputed from the tables above. +-- Recomputation is done using the update_sharing.py script. + +-- The sharing table caches two values for each pair of packages pid1, pid2 and +-- pair of hash functions fid1, fid2: +-- * files is the number of files in pid1 that could be eliminated by reusing +-- files from pid2. Since the functions may be different, this may mean +-- replacing a compressed file with an uncompressed one. +-- * size is the number of bytes that would be freed by doing the above. +-- Note: If pid1=pid2, one copy of each file must be preserved. CREATE TABLE sharing ( pid1 INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE, pid2 INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE, fid1 INTEGER NOT NULL REFERENCES function(id), fid2 INTEGER NOT NULL REFERENCES function(id), - files INTEGER, - size INTEGER); + files INTEGER NOT NULL, + size INTEGER NOT NULL); CREATE INDEX sharing_insert_index ON sharing (pid1, pid2, fid1, fid2); -CREATE TABLE duplicate (cid INTEGER PRIMARY KEY, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE); -CREATE TABLE issue (cid INTEGER REFERENCES content(id) ON DELETE CASCADE, issue TEXT); + +-- The duplicate table caches all files that have any non-unique hash value. +-- It is used in webapp.py to speed up one query, but could be dropped +-- otherwise. +CREATE TABLE duplicate ( + cid INTEGER PRIMARY KEY REFERENCES content(id) ON DELETE CASCADE); + +-- The issue table contains auxillary notices per file. For example, when a +-- filename ends in ".gz", but is not gzip decompressible (has no gzip_sha512 +-- hash), it is recorded here. +CREATE TABLE issue ( + cid INTEGER NOT NULL REFERENCES content(id) ON DELETE CASCADE, + issue TEXT NOT NULL); -- a human readable comment on the file |