From 6f88561d726327c90f83b8aad1db26abbd4cdf1e Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Tue, 23 Jul 2013 18:53:55 +0200 Subject: schema: reference hash functions by integer key This already worked quite well for package.id. On a test data set of 5% size this transformation reduces the database size by about 4%. --- readyaml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'readyaml.py') diff --git a/readyaml.py b/readyaml.py index bb8ac54..f4d6ead 100755 --- a/readyaml.py +++ b/readyaml.py @@ -45,7 +45,7 @@ def readyaml(db, stream): cur.execute("INSERT INTO content (pid, filename, size) VALUES (?, ?, ?);", (pid, entry["name"], entry["size"])) cid = cur.lastrowid - cur.executemany("INSERT INTO hash (cid, function, hash) VALUES (?, ?, ?);", + cur.executemany("INSERT INTO hash (cid, fid, hash) VALUES (?, (SELECT id FROM function WHERE name = ?), ?);", ((cid, func, hexhash) for func, hexhash in entry["hashes"].items())) raise ValueError("missing commit block") -- cgit v1.2.3 From 32f406706c0a2a21b11656e5c56ff203e0ee3799 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Wed, 24 Jul 2013 07:20:19 +0200 Subject: readyaml: cache the whole function table This should reduce the query bandwidth to the rdbms. --- readyaml.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'readyaml.py') diff --git a/readyaml.py b/readyaml.py index f4d6ead..21b1ca1 100755 --- a/readyaml.py +++ b/readyaml.py @@ -25,6 +25,8 @@ def readyaml(db, stream): pid = None cur.execute("BEGIN;") + cur.execute("SELECT name, id FROM function;") + funcmapping = dict(cur.fetchall()) if pid is not None: cur.execute("DELETE FROM content WHERE pid = ?;", (pid,)) cur.execute("DELETE FROM dependency WHERE pid = ?;", (pid,)) @@ -45,8 +47,8 @@ def readyaml(db, stream): cur.execute("INSERT INTO content (pid, filename, size) VALUES (?, ?, ?);", (pid, entry["name"], entry["size"])) cid = cur.lastrowid - cur.executemany("INSERT INTO hash (cid, fid, hash) VALUES (?, (SELECT id FROM function WHERE name = ?), ?);", - ((cid, func, hexhash) + cur.executemany("INSERT INTO hash (cid, fid, hash) VALUES (?, ?, ?);", + ((cid, funcmapping[func], hexhash) for func, hexhash in entry["hashes"].items())) raise ValueError("missing commit block") -- cgit v1.2.3