diff options
author | Helmut Grohne <helmut@subdivi.de> | 2014-06-14 10:19:55 +0200 |
---|---|---|
committer | Helmut Grohne <helmut@subdivi.de> | 2014-06-14 10:19:55 +0200 |
commit | bd1a5eb04cf1cb3dc69537c8cdda6130342223e6 (patch) | |
tree | e30dd3fe6b753f3ed6c7416ef01bc82fa8d553c8 | |
parent | b343a67cde7d37ff01674d2cd399e874ea0aaccc (diff) | |
download | debian-dedup-bd1a5eb04cf1cb3dc69537c8cdda6130342223e6.tar.gz |
add documentation to schema.sql
Thanks to Peter Palfrader for explaining what information is needed and
reviewing the documentation.
-rw-r--r-- | schema.sql | 58 |
1 files changed, 50 insertions, 8 deletions
@@ -1,13 +1,48 @@ -CREATE TABLE package (id INTEGER PRIMARY KEY, name TEXT UNIQUE, version TEXT, architecture TEXT, source TEXT); -CREATE TABLE content (id INTEGER PRIMARY KEY, pid INTEGER, filename TEXT, size INTEGER, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE); -CREATE TABLE function (id INTEGER PRIMARY KEY, name TEXT UNIQUE NOT NULL, eqclass INTEGER); -INSERT INTO function (id, name, eqclass) VALUES (1, 'sha512', 1), (2, 'gzip_sha512', 1), (3, 'png_sha512', 2), (4, 'gif_sha512', 2); -CREATE TABLE hash (cid INTEGER, fid INTEGER NOT NULL, hash TEXT, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE, FOREIGN KEY (fid) REFERENCES function(id)); -CREATE TABLE dependency (pid INTEGER, required TEXT, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE); +CREATE TABLE package ( -- binary Debian packages. + id INTEGER PRIMARY KEY, + name TEXT UNIQUE, -- binary package name + version TEXT, -- Debian version + architecture TEXT, + source TEXT); -- name of the source package it was built from +CREATE TABLE content ( -- a file contained in a binary package + id INTEGER PRIMARY KEY, + pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE, + -- which package the file is contained in + filename TEXT, + size INTEGER); +CREATE TABLE function ( -- hash functions + id INTEGER PRIMARY KEY, + name TEXT UNIQUE NOT NULL, + eqclass INTEGER); -- hash values of different hash functions are comparable if they share an eqclass +INSERT INTO function (id, name, eqclass) VALUES + (1, 'sha512', 1), + (2, 'gzip_sha512', 1), + (3, 'png_sha512', 2), + (4, 'gif_sha512', 2); +CREATE TABLE hash ( -- hash values of files in binary packages + cid INTEGER NOT NULL REFERENCES content(id) ON DELETE CASCADE, + -- which file has been hashed + fid INTEGER NOT NULL REFERENCES function(id), + -- using which function + hash TEXT); -- textual hash value +CREATE TABLE dependency ( -- binary package dependencies + pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE, + -- the package carries a Depends: header + required TEXT); -- the name of a package that is dependet upon without alternative CREATE INDEX content_package_size_index ON content (pid, size); CREATE INDEX hash_cid_index ON hash (cid); CREATE INDEX hash_hash_index ON hash (hash); +-- All tables below this line can be recomputed from the tables above. +-- Recomputation is done using the update_sharing.py script. + +-- The sharing table caches two values for each pair of packages pid1, pid2 and +-- pair of hash functions fid1, fid2: +-- * files is the number of files in pid1 that could be eliminated by reusing +-- files from pid2. Since the functions may be different, this may mean +-- replacing a compressed file with an uncompressed one. +-- * size is the number of bytes that would be freed by doing the above. +-- Note: If pid1=pid2, one copy of each file must be preserved. CREATE TABLE sharing ( pid1 INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE, pid2 INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE, @@ -16,5 +51,12 @@ CREATE TABLE sharing ( files INTEGER, size INTEGER); CREATE INDEX sharing_insert_index ON sharing (pid1, pid2, fid1, fid2); -CREATE TABLE duplicate (cid INTEGER PRIMARY KEY, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE); -CREATE TABLE issue (cid INTEGER REFERENCES content(id) ON DELETE CASCADE, issue TEXT); +-- The duplicate table caches all files that have any non-unique hash value. +CREATE TABLE duplicate ( + cid INTEGER PRIMARY KEY REFERENCES content(id) ON DELETE CASCADE); +-- The issue table contains auxillary notices per file. For example, when a +-- filename ends in ".gz", but is not gzip decompressible (has no gzip_sha512 +-- hash), it is recorded here. +CREATE TABLE issue ( + cid INTEGER REFERENCES content(id) ON DELETE CASCADE, + issue TEXT); |