diff options
-rwxr-xr-x | importpkg.py | 8 | ||||
-rwxr-xr-x | readyaml.py | 5 | ||||
-rw-r--r-- | schema.sql | 2 |
3 files changed, 8 insertions, 7 deletions
diff --git a/importpkg.py b/importpkg.py index 06d9da4..8f2390c 100755 --- a/importpkg.py +++ b/importpkg.py @@ -82,11 +82,9 @@ def process_package(filelike, hash_functions): state = "data" tf = decompress_tar(af, name[8:]) for name, size, hashes in get_tar_hashes(tf, hash_functions): - try: - name = name.decode("utf8") - except UnicodeDecodeError: - print("warning: skipping filename with encoding error") - continue # skip files with non-utf8 encoding for now + # filenames are not actually iso-8859-1, but this decode + # cannot cause UnicodeDecodeError + name = name.decode("iso-8859-1") yield dict(name=name, size=size, hashes=hashes) yield "commit" break diff --git a/readyaml.py b/readyaml.py index 2ef9a3b..3e5ba87 100755 --- a/readyaml.py +++ b/readyaml.py @@ -45,8 +45,11 @@ def readyaml(db, stream): db.commit() return + # iso-8859-1 is used as a safe representation of binary data within + # utf-8. + filename = buffer(entry["name"].encode("iso-8859-1")) cur.execute("INSERT INTO content (pid, filename, size) VALUES (?, ?, ?);", - (pid, entry["name"], entry["size"])) + (pid, filename, entry["size"])) cid = cur.lastrowid cur.executemany("INSERT INTO hash (cid, fid, hash) VALUES (?, ?, ?);", ((cid, funcmapping[func], hexhash) @@ -10,7 +10,7 @@ CREATE TABLE content ( -- a file contained in a binary package id INTEGER PRIMARY KEY, pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE, -- which package the file is contained in - filename TEXT NOT NULL, + filename BLOB NOT NULL, size INTEGER NOT NULL); CREATE TABLE function ( -- hash functions |