summaryrefslogtreecommitdiff
path: root/schema.sql
blob: ce36e09bf9d6108faaa4fcd208f8902071153adc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
CREATE TABLE package (		-- binary Debian packages
	id INTEGER PRIMARY KEY,
	name TEXT UNIQUE NOT NULL,
	-- binary package name
	version TEXT,		-- Debian version
	architecture TEXT,
	source TEXT);		-- name of the source package it was built from

CREATE TABLE content (		-- a file contained in a binary package
	id INTEGER PRIMARY KEY,
	pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
	-- which package the file is contained in
	filename BLOB NOT NULL,
	size INTEGER NOT NULL);

CREATE TABLE function (			-- hash functions
	id INTEGER PRIMARY KEY,
	name TEXT UNIQUE NOT NULL,
	eqclass INTEGER);
	-- hash values of different hash functions are comparable if they share
	-- an eqclass

INSERT INTO function (id, name, eqclass) VALUES
	(1, 'sha512', 1),
	(2, 'gzip_sha512', 1),
	-- decompress a gzip file, then hash
	(3, 'png_sha512', 2),
	-- decompress a PNG file, hash RGBA image contents + dimension
	(4, 'gif_sha512', 2);
	-- decompress a GIF file, hash RGBA image contents + dimension

CREATE TABLE hash (		-- hash values of files in binary packages
	cid INTEGER NOT NULL REFERENCES content(id) ON DELETE CASCADE,
	-- which file has been hashed
	fid INTEGER NOT NULL REFERENCES function(id),
	-- using which function
	hash TEXT NOT NULL);	-- textual hash value

CREATE TABLE dependency (	-- binary package dependencies
	pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
	-- the package that carries a Depends: header
	required TEXT NOT NULL);
	-- the name of a package that is depended upon without alternative

CREATE INDEX content_package_size_index ON content (pid, size);
CREATE INDEX hash_cid_index ON hash (cid);
CREATE INDEX hash_hash_index ON hash (hash);

-- All tables below this line can be recomputed from the tables above.
-- Recomputation is done using the update_sharing.py script.

-- The sharing table caches two values for each pair of packages pid1, pid2 and
-- pair of hash functions fid1, fid2:
--  * files is the number of files in pid1 that could be eliminated by reusing
--    files from pid2. Since the functions may be different, this may mean
--    replacing a compressed file with an uncompressed one.
--  * size is the number of bytes that would be freed by doing the above.
-- Note: If pid1=pid2, one copy of each file must be preserved.
CREATE TABLE sharing (
	pid1 INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
	pid2 INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
	fid1 INTEGER NOT NULL REFERENCES function(id),
	fid2 INTEGER NOT NULL REFERENCES function(id),
	files INTEGER NOT NULL,
	size INTEGER NOT NULL);
CREATE INDEX sharing_insert_index ON sharing (pid1, pid2, fid1, fid2);

-- The duplicate table caches all files that have any non-unique hash value.
-- It is used in webapp.py to speed up one query, but could be dropped
-- otherwise.
CREATE TABLE duplicate (
	cid INTEGER PRIMARY KEY REFERENCES content(id) ON DELETE CASCADE);

-- The issue table contains auxillary notices per file. For example, when a
-- filename ends in ".gz", but is not gzip decompressible (has no gzip_sha512
-- hash), it is recorded here.
CREATE TABLE issue (
	cid INTEGER NOT NULL REFERENCES content(id) ON DELETE CASCADE,
	issue TEXT NOT NULL);		-- a human readable comment on the file