diff options
author | Helmut Grohne <helmut@subdivi.de> | 2014-12-12 13:28:02 +0100 |
---|---|---|
committer | Helmut Grohne <helmut@subdivi.de> | 2014-12-12 13:28:02 +0100 |
commit | 36fd6fc4bd1c6930c77aa5b6408a832c1e651ef6 (patch) | |
tree | b1a9fcedb51aee454f42e5d1afa8b94db0ccafea /schema.sql | |
parent | 2a728ab85e1ddfeec03514f86f706c116ca94440 (diff) | |
download | debian-dedup-controldata.tar.gz |
full text searching on control.tar memberscontroldata
This is a rather strange variant that has nothing to do with
deduplication anymore. Instead, it enables searching a sqlite fts4 table
containing all members of control.tars.
Diffstat (limited to 'schema.sql')
-rw-r--r-- | schema.sql | 72 |
1 files changed, 9 insertions, 63 deletions
@@ -6,74 +6,20 @@ CREATE TABLE package ( -- binary Debian packages architecture TEXT, source TEXT); -- name of the source package it was built from -CREATE TABLE content ( -- a file contained in a binary package - id INTEGER PRIMARY KEY, - pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE, - -- which package the file is contained in - filename TEXT NOT NULL, - size INTEGER NOT NULL); - -CREATE TABLE function ( -- hash functions - id INTEGER PRIMARY KEY, - name TEXT UNIQUE NOT NULL, - eqclass INTEGER); - -- hash values of different hash functions are comparable if they share - -- an eqclass - -INSERT INTO function (id, name, eqclass) VALUES - (1, 'sha512', 1), - (2, 'gzip_sha512', 1), - -- decompress a gzip file, then hash - (3, 'png_sha512', 2), - -- decompress a PNG file, hash RGBA image contents + dimension - (4, 'gif_sha512', 2); - -- decompress a GIF file, hash RGBA image contents + dimension - -CREATE TABLE hash ( -- hash values of files in binary packages - cid INTEGER NOT NULL REFERENCES content(id) ON DELETE CASCADE, - -- which file has been hashed - fid INTEGER NOT NULL REFERENCES function(id), - -- using which function - hash TEXT NOT NULL); -- textual hash value - CREATE TABLE dependency ( -- binary package dependencies pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE, -- the package that carries a Depends: header required TEXT NOT NULL); -- the name of a package that is depended upon without alternative -CREATE INDEX content_package_size_index ON content (pid, size); -CREATE INDEX hash_cid_index ON hash (cid); -CREATE INDEX hash_hash_index ON hash (hash); - --- All tables below this line can be recomputed from the tables above. --- Recomputation is done using the update_sharing.py script. +CREATE VIRTUAL TABLE controlcontent USING fts4(content); --- The sharing table caches two values for each pair of packages pid1, pid2 and --- pair of hash functions fid1, fid2: --- * files is the number of files in pid1 that could be eliminated by reusing --- files from pid2. Since the functions may be different, this may mean --- replacing a compressed file with an uncompressed one. --- * size is the number of bytes that would be freed by doing the above. --- Note: If pid1=pid2, one copy of each file must be preserved. -CREATE TABLE sharing ( - pid1 INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE, - pid2 INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE, - fid1 INTEGER NOT NULL REFERENCES function(id), - fid2 INTEGER NOT NULL REFERENCES function(id), - files INTEGER NOT NULL, - size INTEGER NOT NULL); -CREATE INDEX sharing_insert_index ON sharing (pid1, pid2, fid1, fid2); - --- The duplicate table caches all files that have any non-unique hash value. --- It is used in webapp.py to speed up one query, but could be dropped --- otherwise. -CREATE TABLE duplicate ( - cid INTEGER PRIMARY KEY REFERENCES content(id) ON DELETE CASCADE); +CREATE TABLE control ( -- control.tar contents + pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE, + -- the package that contains a control.tar member + name TEXT NOT NULL, + -- the name of the control.tar member without leading "./" + cid INTEGER NOT NULL); + -- a reference to the binary contents of the file --- The issue table contains auxillary notices per file. For example, when a --- filename ends in ".gz", but is not gzip decompressible (has no gzip_sha512 --- hash), it is recorded here. -CREATE TABLE issue ( - cid INTEGER NOT NULL REFERENCES content(id) ON DELETE CASCADE, - issue TEXT NOT NULL); -- a human readable comment on the file +CREATE INDEX control_name_index ON control(name); |