From bb0aea9971bc79d8787d8f034022d0ca803fcab3 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Sat, 8 Mar 2014 09:48:17 +0100 Subject: schema: make syntax compatible with postgres --- schema.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'schema.sql') diff --git a/schema.sql b/schema.sql index 2ab7ca7..f6b1a7c 100644 --- a/schema.sql +++ b/schema.sql @@ -1,7 +1,7 @@ CREATE TABLE package (id INTEGER PRIMARY KEY, name TEXT UNIQUE, version TEXT, architecture TEXT, source TEXT); CREATE TABLE content (id INTEGER PRIMARY KEY, pid INTEGER, filename TEXT, size INTEGER, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE); CREATE TABLE function (id INTEGER PRIMARY KEY, name TEXT UNIQUE NOT NULL, eqclass INTEGER); -INSERT INTO function (name, eqclass) VALUES ("sha512", 1), ("gzip_sha512", 1), ("png_sha512", 2), ("gif_sha512", 2); +INSERT INTO function (id, name, eqclass) VALUES (1, 'sha512', 1), (2, 'gzip_sha512', 1), (3, 'png_sha512', 2), (4, 'gif_sha512', 2); CREATE TABLE hash (cid INTEGER, fid INTEGER NOT NULL, hash TEXT, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE, FOREIGN KEY (fid) REFERENCES function(id)); CREATE TABLE dependency (pid INTEGER, required TEXT, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE); CREATE INDEX content_package_size_index ON content (pid, size); -- cgit v1.2.3 From bd1a5eb04cf1cb3dc69537c8cdda6130342223e6 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Sat, 14 Jun 2014 10:19:55 +0200 Subject: add documentation to schema.sql Thanks to Peter Palfrader for explaining what information is needed and reviewing the documentation. --- schema.sql | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 8 deletions(-) (limited to 'schema.sql') diff --git a/schema.sql b/schema.sql index f6b1a7c..35f82ca 100644 --- a/schema.sql +++ b/schema.sql @@ -1,13 +1,48 @@ -CREATE TABLE package (id INTEGER PRIMARY KEY, name TEXT UNIQUE, version TEXT, architecture TEXT, source TEXT); -CREATE TABLE content (id INTEGER PRIMARY KEY, pid INTEGER, filename TEXT, size INTEGER, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE); -CREATE TABLE function (id INTEGER PRIMARY KEY, name TEXT UNIQUE NOT NULL, eqclass INTEGER); -INSERT INTO function (id, name, eqclass) VALUES (1, 'sha512', 1), (2, 'gzip_sha512', 1), (3, 'png_sha512', 2), (4, 'gif_sha512', 2); -CREATE TABLE hash (cid INTEGER, fid INTEGER NOT NULL, hash TEXT, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE, FOREIGN KEY (fid) REFERENCES function(id)); -CREATE TABLE dependency (pid INTEGER, required TEXT, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE); +CREATE TABLE package ( -- binary Debian packages. + id INTEGER PRIMARY KEY, + name TEXT UNIQUE, -- binary package name + version TEXT, -- Debian version + architecture TEXT, + source TEXT); -- name of the source package it was built from +CREATE TABLE content ( -- a file contained in a binary package + id INTEGER PRIMARY KEY, + pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE, + -- which package the file is contained in + filename TEXT, + size INTEGER); +CREATE TABLE function ( -- hash functions + id INTEGER PRIMARY KEY, + name TEXT UNIQUE NOT NULL, + eqclass INTEGER); -- hash values of different hash functions are comparable if they share an eqclass +INSERT INTO function (id, name, eqclass) VALUES + (1, 'sha512', 1), + (2, 'gzip_sha512', 1), + (3, 'png_sha512', 2), + (4, 'gif_sha512', 2); +CREATE TABLE hash ( -- hash values of files in binary packages + cid INTEGER NOT NULL REFERENCES content(id) ON DELETE CASCADE, + -- which file has been hashed + fid INTEGER NOT NULL REFERENCES function(id), + -- using which function + hash TEXT); -- textual hash value +CREATE TABLE dependency ( -- binary package dependencies + pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE, + -- the package carries a Depends: header + required TEXT); -- the name of a package that is dependet upon without alternative CREATE INDEX content_package_size_index ON content (pid, size); CREATE INDEX hash_cid_index ON hash (cid); CREATE INDEX hash_hash_index ON hash (hash); +-- All tables below this line can be recomputed from the tables above. +-- Recomputation is done using the update_sharing.py script. + +-- The sharing table caches two values for each pair of packages pid1, pid2 and +-- pair of hash functions fid1, fid2: +-- * files is the number of files in pid1 that could be eliminated by reusing +-- files from pid2. Since the functions may be different, this may mean +-- replacing a compressed file with an uncompressed one. +-- * size is the number of bytes that would be freed by doing the above. +-- Note: If pid1=pid2, one copy of each file must be preserved. CREATE TABLE sharing ( pid1 INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE, pid2 INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE, @@ -16,5 +51,12 @@ CREATE TABLE sharing ( files INTEGER, size INTEGER); CREATE INDEX sharing_insert_index ON sharing (pid1, pid2, fid1, fid2); -CREATE TABLE duplicate (cid INTEGER PRIMARY KEY, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE); -CREATE TABLE issue (cid INTEGER REFERENCES content(id) ON DELETE CASCADE, issue TEXT); +-- The duplicate table caches all files that have any non-unique hash value. +CREATE TABLE duplicate ( + cid INTEGER PRIMARY KEY REFERENCES content(id) ON DELETE CASCADE); +-- The issue table contains auxillary notices per file. For example, when a +-- filename ends in ".gz", but is not gzip decompressible (has no gzip_sha512 +-- hash), it is recorded here. +CREATE TABLE issue ( + cid INTEGER REFERENCES content(id) ON DELETE CASCADE, + issue TEXT); -- cgit v1.2.3 From d48c3c208ee6ba54225b3eb68ce5c9f3c894bfa4 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Sat, 14 Jun 2014 12:08:09 +0200 Subject: improve schema documentation wording, more NOT NULLs, some more explanations --- schema.sql | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) (limited to 'schema.sql') diff --git a/schema.sql b/schema.sql index 35f82ca..99ae7e5 100644 --- a/schema.sql +++ b/schema.sql @@ -1,34 +1,47 @@ -CREATE TABLE package ( -- binary Debian packages. +CREATE TABLE package ( -- binary Debian packages id INTEGER PRIMARY KEY, - name TEXT UNIQUE, -- binary package name + name TEXT UNIQUE NOT NULL, + -- binary package name version TEXT, -- Debian version architecture TEXT, source TEXT); -- name of the source package it was built from + CREATE TABLE content ( -- a file contained in a binary package id INTEGER PRIMARY KEY, pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE, - -- which package the file is contained in - filename TEXT, - size INTEGER); + -- which package the file is contained in + filename TEXT NOT NULL, + size INTEGER NOT NULL); + CREATE TABLE function ( -- hash functions id INTEGER PRIMARY KEY, name TEXT UNIQUE NOT NULL, - eqclass INTEGER); -- hash values of different hash functions are comparable if they share an eqclass + eqclass INTEGER); + -- hash values of different hash functions are comparable if they share + -- an eqclass + INSERT INTO function (id, name, eqclass) VALUES (1, 'sha512', 1), (2, 'gzip_sha512', 1), + -- decompress a gzip file, then hash (3, 'png_sha512', 2), + -- decompress a PNG file, hash RGBA image contents + dimension (4, 'gif_sha512', 2); + -- decompress a GIF file, hash RGBA image contents + dimension + CREATE TABLE hash ( -- hash values of files in binary packages cid INTEGER NOT NULL REFERENCES content(id) ON DELETE CASCADE, - -- which file has been hashed + -- which file has been hashed fid INTEGER NOT NULL REFERENCES function(id), - -- using which function - hash TEXT); -- textual hash value + -- using which function + hash TEXT NOT NULL); -- textual hash value + CREATE TABLE dependency ( -- binary package dependencies pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE, - -- the package carries a Depends: header - required TEXT); -- the name of a package that is dependet upon without alternative + -- the package that carries a Depends: header + required TEXT NOT NULL); + -- the name of a package that is depended upon without alternative + CREATE INDEX content_package_size_index ON content (pid, size); CREATE INDEX hash_cid_index ON hash (cid); CREATE INDEX hash_hash_index ON hash (hash); @@ -48,15 +61,19 @@ CREATE TABLE sharing ( pid2 INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE, fid1 INTEGER NOT NULL REFERENCES function(id), fid2 INTEGER NOT NULL REFERENCES function(id), - files INTEGER, - size INTEGER); + files INTEGER NOT NULL, + size INTEGER NOT NULL); CREATE INDEX sharing_insert_index ON sharing (pid1, pid2, fid1, fid2); + -- The duplicate table caches all files that have any non-unique hash value. +-- It is used in webapp.py to speed up one query, but could be dropped +-- otherwise. CREATE TABLE duplicate ( cid INTEGER PRIMARY KEY REFERENCES content(id) ON DELETE CASCADE); + -- The issue table contains auxillary notices per file. For example, when a -- filename ends in ".gz", but is not gzip decompressible (has no gzip_sha512 -- hash), it is recorded here. CREATE TABLE issue ( - cid INTEGER REFERENCES content(id) ON DELETE CASCADE, - issue TEXT); + cid INTEGER NOT NULL REFERENCES content(id) ON DELETE CASCADE, + issue TEXT NOT NULL); -- a human readable comment on the file -- cgit v1.2.3