summaryrefslogtreecommitdiff
path: root/schema.sql
diff options
context:
space:
mode:
Diffstat (limited to 'schema.sql')
-rw-r--r--schema.sql81
1 files changed, 71 insertions, 10 deletions
diff --git a/schema.sql b/schema.sql
index 30f6109..3a49daf 100644
--- a/schema.sql
+++ b/schema.sql
@@ -1,20 +1,81 @@
-CREATE TABLE package (id INTEGER PRIMARY KEY, name TEXT, version TEXT, architecture TEXT, source TEXT, multiarch TEXT, UNIQUE (name, architecture));
-CREATE TABLE content (id INTEGER PRIMARY KEY, pid INTEGER, filename TEXT, size INTEGER, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE);
-CREATE TABLE function (id INTEGER PRIMARY KEY, name TEXT UNIQUE NOT NULL, eqclass INTEGER);
-INSERT INTO function (name, eqclass) VALUES ("sha512", 1), ("gzip_sha512", 1), ("png_sha512", 2), ("gif_sha512", 2);
-CREATE TABLE hash (cid INTEGER, fid INTEGER NOT NULL, hash TEXT, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE, FOREIGN KEY (fid) REFERENCES function(id));
-CREATE TABLE dependency (pid INTEGER, required TEXT, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE);
+CREATE TABLE package ( -- binary Debian packages
+ id INTEGER PRIMARY KEY,
+ name TEXT NOT NULL,
+ -- binary package name
+ version TEXT, -- Debian version
+ architecture TEXT,
+ source TEXT, -- name of the source package it was built from
+ multiarch TEXT, -- value of the Multi-Arch field
+ UNIQUE (name, architecture));
+
+CREATE TABLE content ( -- a file contained in a binary package
+ id INTEGER PRIMARY KEY,
+ pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
+ -- which package the file is contained in
+ filename TEXT NOT NULL,
+ size INTEGER NOT NULL);
+
+CREATE TABLE function ( -- hash functions
+ id INTEGER PRIMARY KEY,
+ name TEXT UNIQUE NOT NULL,
+ eqclass INTEGER);
+ -- hash values of different hash functions are comparable if they share
+ -- an eqclass
+
+INSERT INTO function (id, name, eqclass) VALUES
+ (1, 'sha512', 1),
+ (2, 'gzip_sha512', 1),
+ -- decompress a gzip file, then hash
+ (3, 'png_sha512', 2),
+ -- decompress a PNG file, hash RGBA image contents + dimension
+ (4, 'gif_sha512', 2);
+ -- decompress a GIF file, hash RGBA image contents + dimension
+
+CREATE TABLE hash ( -- hash values of files in binary packages
+ cid INTEGER NOT NULL REFERENCES content(id) ON DELETE CASCADE,
+ -- which file has been hashed
+ fid INTEGER NOT NULL REFERENCES function(id),
+ -- using which function
+ hash TEXT NOT NULL); -- textual hash value
+
+CREATE TABLE dependency ( -- binary package dependencies
+ pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
+ -- the package that carries a Depends: header
+ required TEXT NOT NULL);
+ -- the name of a package that is depended upon without alternative
+
CREATE INDEX content_package_size_index ON content (pid, size);
CREATE INDEX hash_cid_index ON hash (cid);
CREATE INDEX hash_hash_index ON hash (hash);
+-- All tables below this line can be recomputed from the tables above.
+-- Recomputation is done using the update_sharing.py script.
+
+-- The sharing table caches two values for each pair of packages pid1, pid2 and
+-- pair of hash functions fid1, fid2:
+-- * files is the number of files in pid1 that could be eliminated by reusing
+-- files from pid2. Since the functions may be different, this may mean
+-- replacing a compressed file with an uncompressed one.
+-- * size is the number of bytes that would be freed by doing the above.
+-- Note: If pid1=pid2, one copy of each file must be preserved.
CREATE TABLE sharing (
pid1 INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
pid2 INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
fid1 INTEGER NOT NULL REFERENCES function(id),
fid2 INTEGER NOT NULL REFERENCES function(id),
- files INTEGER,
- size INTEGER);
+ files INTEGER NOT NULL,
+ size INTEGER NOT NULL);
CREATE INDEX sharing_insert_index ON sharing (pid1, pid2, fid1, fid2);
-CREATE TABLE duplicate (cid INTEGER PRIMARY KEY, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE);
-CREATE TABLE issue (cid INTEGER REFERENCES content(id) ON DELETE CASCADE, issue TEXT);
+
+-- The duplicate table caches all files that have any non-unique hash value.
+-- It is used in webapp.py to speed up one query, but could be dropped
+-- otherwise.
+CREATE TABLE duplicate (
+ cid INTEGER PRIMARY KEY REFERENCES content(id) ON DELETE CASCADE);
+
+-- The issue table contains auxillary notices per file. For example, when a
+-- filename ends in ".gz", but is not gzip decompressible (has no gzip_sha512
+-- hash), it is recorded here.
+CREATE TABLE issue (
+ cid INTEGER NOT NULL REFERENCES content(id) ON DELETE CASCADE,
+ issue TEXT NOT NULL); -- a human readable comment on the file