From 1359895781ec1f7887121984abc46aefc61c6717 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Mon, 10 Jun 2013 18:22:29 +0200 Subject: split the import phase to a yaml stream importpkg.py now emits a yaml stream instead of updating the database. The actual updating now happens in readyaml.py. In this process autoimport.py was significantly reworked to import packages in parallel. --- readyaml.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100755 readyaml.py (limited to 'readyaml.py') diff --git a/readyaml.py b/readyaml.py new file mode 100755 index 0000000..b66c7f3 --- /dev/null +++ b/readyaml.py @@ -0,0 +1,48 @@ +#!/usr/bin/python +"""This tool reads a yaml file as generated by importpkg.py on stdin and +updates the database with the contents.""" + +import sqlite3 +import sys + +from debian.debian_support import version_compare +import yaml + +def main(): + db = sqlite3.connect("test.sqlite3") + cur = db.cursor() + cur.execute("PRAGMA foreign_keys = ON;") + gen = yaml.safe_load_all(sys.stdin) + metadata = next(gen) + package = metadata["package"] + cur.execute("SELECT version FROM package WHERE package = ?;", + (package,)) + row = cur.fetchone() + if row and version_compare(row[0], metadata["version"]) > 0: + return + + cur.execute("BEGIN;") + cur.execute("DELETE FROM content WHERE package = ?;", + (package,)) + cur.execute("INSERT OR REPLACE INTO package (package, version, architecture, source) VALUES (?, ?, ?, ?);", + (package, metadata["version"], metadata["architecture"], + metadata["source"])) + cur.execute("DELETE FROM dependency WHERE package = ?;", + (package,)) + cur.executemany("INSERT INTO dependency (package, required) VALUES (?, ?);", + ((package, dep) for dep in metadata["depends"])) + for entry in gen: + if entry == "commit": + db.commit() + return + + cur.execute("INSERT INTO content (package, filename, size) VALUES (?, ?, ?);", + (package, entry["name"], entry["size"])) + cid = 
cur.lastrowid + cur.executemany("INSERT INTO hash (cid, function, hash) VALUES (?, ?, ?);", + ((cid, func, hexhash) + for func, hexhash in entry["hashes"].items())) + raise ValueError("missing commit block") + +if __name__ == "__main__": + main() -- cgit v1.2.3 From f652c17f242fb743a167041521e9618039ae7296 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Tue, 11 Jun 2013 23:22:10 +0200 Subject: autoimport: don't fork for readyaml This appears to be a huge performance boost. --- autoimport.py | 8 +++++--- readyaml.py | 9 ++++++--- 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'readyaml.py') diff --git a/autoimport.py b/autoimport.py index 48d3f8c..d326d61 100755 --- a/autoimport.py +++ b/autoimport.py @@ -16,6 +16,8 @@ import concurrent.futures from debian import deb822 from debian.debian_support import version_compare +from readyaml import readyaml + def process_http(pkgs, url): pkglist = urllib.urlopen(url + "/dists/sid/main/binary-amd64/Packages.gz").read() pkglist = gzip.GzipFile(fileobj=io.BytesIO(pkglist)).read() @@ -115,9 +117,9 @@ def main(): print("sqlimporting %s" % name) with open(inf) as inp: try: - subprocess.check_call(["python", "readyaml.py"], stdin=inp) - except subprocess.CalledProcessError: - print("%s failed sql" % name) + readyaml(db, inp) + except Exception as exc: + print("%s failed sql with exception %r" % (name, exc)) else: os.unlink(inf) diff --git a/readyaml.py b/readyaml.py index b66c7f3..e2f3bb3 100755 --- a/readyaml.py +++ b/readyaml.py @@ -8,11 +8,10 @@ import sys from debian.debian_support import version_compare import yaml -def main(): - db = sqlite3.connect("test.sqlite3") +def readyaml(db, stream): cur = db.cursor() cur.execute("PRAGMA foreign_keys = ON;") - gen = yaml.safe_load_all(sys.stdin) + gen = yaml.safe_load_all(stream) metadata = next(gen) package = metadata["package"] cur.execute("SELECT version FROM package WHERE package = ?;", @@ -44,5 +43,9 @@ def main(): for func, hexhash in entry["hashes"].items())) 
raise ValueError("missing commit block") +def main(): + db = sqlite3.connect("test.sqlite3") + readyaml(db, sys.stdin) + if __name__ == "__main__": main() -- cgit v1.2.3