summaryrefslogtreecommitdiff
path: root/schema.sql
diff options
context:
space:
mode:
authorHelmut Grohne <helmut@subdivi.de>2014-12-12 13:28:02 +0100
committerHelmut Grohne <helmut@subdivi.de>2014-12-12 13:28:02 +0100
commit36fd6fc4bd1c6930c77aa5b6408a832c1e651ef6 (patch)
treeb1a9fcedb51aee454f42e5d1afa8b94db0ccafea /schema.sql
parent2a728ab85e1ddfeec03514f86f706c116ca94440 (diff)
downloaddebian-dedup-36fd6fc4bd1c6930c77aa5b6408a832c1e651ef6.tar.gz
full text searching on control.tar memberscontroldata
This is a rather strange variant that has nothing to do with deduplication anymore. Instead, it enables searching a sqlite fts4 table containing all members of control.tars.
Diffstat (limited to 'schema.sql')
-rw-r--r--schema.sql72
1 files changed, 9 insertions, 63 deletions
diff --git a/schema.sql b/schema.sql
index 99ae7e5..b23b8e6 100644
--- a/schema.sql
+++ b/schema.sql
@@ -6,74 +6,20 @@ CREATE TABLE package ( -- binary Debian packages
architecture TEXT,
source TEXT); -- name of the source package it was built from
-CREATE TABLE content ( -- a file contained in a binary package
- id INTEGER PRIMARY KEY,
- pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
- -- which package the file is contained in
- filename TEXT NOT NULL,
- size INTEGER NOT NULL);
-
-CREATE TABLE function ( -- hash functions
- id INTEGER PRIMARY KEY,
- name TEXT UNIQUE NOT NULL,
- eqclass INTEGER);
- -- hash values of different hash functions are comparable if they share
- -- an eqclass
-
-INSERT INTO function (id, name, eqclass) VALUES
- (1, 'sha512', 1),
- (2, 'gzip_sha512', 1),
- -- decompress a gzip file, then hash
- (3, 'png_sha512', 2),
- -- decompress a PNG file, hash RGBA image contents + dimension
- (4, 'gif_sha512', 2);
- -- decompress a GIF file, hash RGBA image contents + dimension
-
-CREATE TABLE hash ( -- hash values of files in binary packages
- cid INTEGER NOT NULL REFERENCES content(id) ON DELETE CASCADE,
- -- which file has been hashed
- fid INTEGER NOT NULL REFERENCES function(id),
- -- using which function
- hash TEXT NOT NULL); -- textual hash value
-
CREATE TABLE dependency ( -- binary package dependencies
pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
-- the package that carries a Depends: header
required TEXT NOT NULL);
-- the name of a package that is depended upon without alternative
-CREATE INDEX content_package_size_index ON content (pid, size);
-CREATE INDEX hash_cid_index ON hash (cid);
-CREATE INDEX hash_hash_index ON hash (hash);
-
--- All tables below this line can be recomputed from the tables above.
--- Recomputation is done using the update_sharing.py script.
+CREATE VIRTUAL TABLE controlcontent USING fts4(content);
--- The sharing table caches two values for each pair of packages pid1, pid2 and
--- pair of hash functions fid1, fid2:
--- * files is the number of files in pid1 that could be eliminated by reusing
--- files from pid2. Since the functions may be different, this may mean
--- replacing a compressed file with an uncompressed one.
--- * size is the number of bytes that would be freed by doing the above.
--- Note: If pid1=pid2, one copy of each file must be preserved.
-CREATE TABLE sharing (
- pid1 INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
- pid2 INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
- fid1 INTEGER NOT NULL REFERENCES function(id),
- fid2 INTEGER NOT NULL REFERENCES function(id),
- files INTEGER NOT NULL,
- size INTEGER NOT NULL);
-CREATE INDEX sharing_insert_index ON sharing (pid1, pid2, fid1, fid2);
-
--- The duplicate table caches all files that have any non-unique hash value.
--- It is used in webapp.py to speed up one query, but could be dropped
--- otherwise.
-CREATE TABLE duplicate (
- cid INTEGER PRIMARY KEY REFERENCES content(id) ON DELETE CASCADE);
+CREATE TABLE control ( -- control.tar contents
+ pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
+ -- the package that contains a control.tar member
+ name TEXT NOT NULL,
+ -- the name of the control.tar member without leading "./"
+ cid INTEGER NOT NULL);
+ -- a reference to the binary contents of the file
--- The issue table contains auxillary notices per file. For example, when a
--- filename ends in ".gz", but is not gzip decompressible (has no gzip_sha512
--- hash), it is recorded here.
-CREATE TABLE issue (
- cid INTEGER NOT NULL REFERENCES content(id) ON DELETE CASCADE,
- issue TEXT NOT NULL); -- a human readable comment on the file
+CREATE INDEX control_name_index ON control(name);