summary refs log tree commit diff
diff options
context:
space:
mode:
-rwxr-xr-x importpkg.py 8
-rwxr-xr-x readyaml.py 5
-rw-r--r-- schema.sql 2
3 files changed, 8 insertions, 7 deletions
diff --git a/importpkg.py b/importpkg.py
index 06d9da4..8f2390c 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -82,11 +82,9 @@ def process_package(filelike, hash_functions):
state = "data"
tf = decompress_tar(af, name[8:])
for name, size, hashes in get_tar_hashes(tf, hash_functions):
- try:
- name = name.decode("utf8")
- except UnicodeDecodeError:
- print("warning: skipping filename with encoding error")
- continue # skip files with non-utf8 encoding for now
+ # filenames are not actually iso-8859-1, but this decode
+ # cannot cause UnicodeDecodeError
+ name = name.decode("iso-8859-1")
yield dict(name=name, size=size, hashes=hashes)
yield "commit"
break
diff --git a/readyaml.py b/readyaml.py
index 2ef9a3b..3e5ba87 100755
--- a/readyaml.py
+++ b/readyaml.py
@@ -45,8 +45,11 @@ def readyaml(db, stream):
db.commit()
return
+ # iso-8859-1 is used as a safe representation of binary data within
+ # utf-8.
+ filename = buffer(entry["name"].encode("iso-8859-1"))
cur.execute("INSERT INTO content (pid, filename, size) VALUES (?, ?, ?);",
- (pid, entry["name"], entry["size"]))
+ (pid, filename, entry["size"]))
cid = cur.lastrowid
cur.executemany("INSERT INTO hash (cid, fid, hash) VALUES (?, ?, ?);",
((cid, funcmapping[func], hexhash)
diff --git a/schema.sql b/schema.sql
index 99ae7e5..ce36e09 100644
--- a/schema.sql
+++ b/schema.sql
@@ -10,7 +10,7 @@ CREATE TABLE content ( -- a file contained in a binary package
id INTEGER PRIMARY KEY,
pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
-- which package the file is contained in
- filename TEXT NOT NULL,
+ filename BLOB NOT NULL,
size INTEGER NOT NULL);
CREATE TABLE function ( -- hash functions