summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHelmut Grohne <helmut@subdivi.de>2014-06-14 10:19:55 +0200
committerHelmut Grohne <helmut@subdivi.de>2014-06-14 10:19:55 +0200
commitbd1a5eb04cf1cb3dc69537c8cdda6130342223e6 (patch)
treee30dd3fe6b753f3ed6c7416ef01bc82fa8d553c8
parentb343a67cde7d37ff01674d2cd399e874ea0aaccc (diff)
downloaddebian-dedup-bd1a5eb04cf1cb3dc69537c8cdda6130342223e6.tar.gz
add documentation to schema.sql
Thanks to Peter Palfrader for explaining what information is needed and reviewing the documentation.
-rw-r--r--schema.sql58
1 files changed, 50 insertions, 8 deletions
diff --git a/schema.sql b/schema.sql
index f6b1a7c..35f82ca 100644
--- a/schema.sql
+++ b/schema.sql
@@ -1,13 +1,48 @@
-CREATE TABLE package (id INTEGER PRIMARY KEY, name TEXT UNIQUE, version TEXT, architecture TEXT, source TEXT);
-CREATE TABLE content (id INTEGER PRIMARY KEY, pid INTEGER, filename TEXT, size INTEGER, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE);
-CREATE TABLE function (id INTEGER PRIMARY KEY, name TEXT UNIQUE NOT NULL, eqclass INTEGER);
-INSERT INTO function (id, name, eqclass) VALUES (1, 'sha512', 1), (2, 'gzip_sha512', 1), (3, 'png_sha512', 2), (4, 'gif_sha512', 2);
-CREATE TABLE hash (cid INTEGER, fid INTEGER NOT NULL, hash TEXT, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE, FOREIGN KEY (fid) REFERENCES function(id));
-CREATE TABLE dependency (pid INTEGER, required TEXT, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE);
+CREATE TABLE package ( -- binary Debian packages.
+ id INTEGER PRIMARY KEY,
+ name TEXT UNIQUE, -- binary package name
+ version TEXT, -- Debian version
+ architecture TEXT,
+ source TEXT); -- name of the source package it was built from
+CREATE TABLE content ( -- a file contained in a binary package
+ id INTEGER PRIMARY KEY,
+ pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
+ -- which package the file is contained in
+ filename TEXT,
+ size INTEGER);
+CREATE TABLE function ( -- hash functions
+ id INTEGER PRIMARY KEY,
+ name TEXT UNIQUE NOT NULL,
+ eqclass INTEGER); -- hash values of different hash functions are comparable if they share an eqclass
+INSERT INTO function (id, name, eqclass) VALUES
+ (1, 'sha512', 1),
+ (2, 'gzip_sha512', 1),
+ (3, 'png_sha512', 2),
+ (4, 'gif_sha512', 2);
+CREATE TABLE hash ( -- hash values of files in binary packages
+ cid INTEGER NOT NULL REFERENCES content(id) ON DELETE CASCADE,
+ -- which file has been hashed
+ fid INTEGER NOT NULL REFERENCES function(id),
+ -- using which function
+ hash TEXT); -- textual hash value
+CREATE TABLE dependency ( -- binary package dependencies
+ pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
+ -- the package carries a Depends: header
+ required TEXT); -- the name of a package that is dependet upon without alternative
CREATE INDEX content_package_size_index ON content (pid, size);
CREATE INDEX hash_cid_index ON hash (cid);
CREATE INDEX hash_hash_index ON hash (hash);
+-- All tables below this line can be recomputed from the tables above.
+-- Recomputation is done using the update_sharing.py script.
+
+-- The sharing table caches two values for each pair of packages pid1, pid2 and
+-- pair of hash functions fid1, fid2:
+-- * files is the number of files in pid1 that could be eliminated by reusing
+-- files from pid2. Since the functions may be different, this may mean
+-- replacing a compressed file with an uncompressed one.
+-- * size is the number of bytes that would be freed by doing the above.
+-- Note: If pid1=pid2, one copy of each file must be preserved.
CREATE TABLE sharing (
pid1 INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
pid2 INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
@@ -16,5 +51,12 @@ CREATE TABLE sharing (
files INTEGER,
size INTEGER);
CREATE INDEX sharing_insert_index ON sharing (pid1, pid2, fid1, fid2);
-CREATE TABLE duplicate (cid INTEGER PRIMARY KEY, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE);
-CREATE TABLE issue (cid INTEGER REFERENCES content(id) ON DELETE CASCADE, issue TEXT);
+-- The duplicate table caches all files that have any non-unique hash value.
+CREATE TABLE duplicate (
+ cid INTEGER PRIMARY KEY REFERENCES content(id) ON DELETE CASCADE);
+-- The issue table contains auxillary notices per file. For example, when a
+-- filename ends in ".gz", but is not gzip decompressible (has no gzip_sha512
+-- hash), it is recorded here.
+CREATE TABLE issue (
+ cid INTEGER REFERENCES content(id) ON DELETE CASCADE,
+ issue TEXT);