author     Helmut Grohne <helmut@subdivi.de>  2021-12-31 15:45:33 +0100
committer  Helmut Grohne <helmut@subdivi.de>  2021-12-31 15:45:33 +0100
commit     f3ea68482e6c01053cb202573d953e8a2e89529f (patch)
tree       4c08f6e5a99bbe5131c0949e7f97cc44cf4a2cbd
parent     f2eda3ba74e5bc5613e84381ebd8bfd343e1c8cc (diff)
parent     5b359b10053cbade539246eec26e86b44793ca40 (diff)
download   debian-dedup-f3ea68482e6c01053cb202573d953e8a2e89529f.tar.gz
Merge branch 'master' into multiarchhints
Among other things, this drops Python 2.x support.
-rw-r--r--  README                       2
-rwxr-xr-x  autoimport.py               85
-rw-r--r--  dedup/arreader.py           79
-rw-r--r--  dedup/compression.py        46
-rw-r--r--  dedup/debpkg.py             61
-rw-r--r--  dedup/filemagic.py          49
-rw-r--r--  dedup/hashing.py            28
-rw-r--r--  dedup/image.py               8
-rw-r--r--  dedup/templates/base.html    2
-rw-r--r--  dedup/utils.py              36
-rwxr-xr-x  importpkg.py                38
-rwxr-xr-x  readyaml.py                  2
-rwxr-xr-x  update_sharing.py            2
-rwxr-xr-x  webapp.py                  125
14 files changed, 243 insertions(+), 320 deletions(-)
diff --git a/README b/README
index 5329bd8..db7fde4 100644
--- a/README
+++ b/README
@@ -1,7 +1,7 @@
Required packages
-----------------
- aptitude install python python-debian python-lzma python-jinja2 python-werkzeug sqlite3 python-imaging python-yaml python-concurrent.futures python-pkg-resources
+ aptitude install python3 python3-arpy python3-debian python3-jinja2 python3-werkzeug sqlite3 python3-pil python3-yaml python3-pkg-resources
Create a database
-----------------
diff --git a/autoimport.py b/autoimport.py
index c00dc4d..0f518c6 100755
--- a/autoimport.py
+++ b/autoimport.py
@@ -1,61 +1,52 @@
-#!/usr/bin/python
+#!/usr/bin/python3
"""This scrip takes a directory or a http base url to a mirror and imports all
packages contained. It has rather strong assumptions on the working directory.
"""
import argparse
-import contextlib
import errno
import multiprocessing
-import os
+import pathlib
import sqlite3
import subprocess
import sys
import tempfile
-try:
- from urllib.parse import unquote
-except ImportError:
- from urllib import unquote
-
+import urllib.parse
import concurrent.futures
-from debian import deb822
from debian.debian_support import version_compare
-from dedup.utils import open_compressed_mirror_url
+from dedup.utils import iterate_packages
from readyaml import readyaml
def process_http(pkgs, url, addhash=True):
- url += "/dists/sid/main/binary-amd64/Packages"
- with contextlib.closing(open_compressed_mirror_url(url)) as pkglist:
- for pkg in deb822.Packages.iter_paragraphs(pkglist):
- name = pkg["Package"]
- if name in pkgs and \
- version_compare(pkgs[name]["version"], pkg["Version"]) > 0:
- continue
- inst = dict(version=pkg["Version"],
- filename="%s/%s" % (url, pkg["Filename"]))
- if addhash:
- inst["sha256hash"] = pkg["SHA256"]
- pkgs[name] = inst
+ for pkg in iterate_packages(url, "amd64"):
+ name = pkg["Package"]
+ if name in pkgs and \
+ version_compare(pkgs[name]["version"], pkg["Version"]) > 0:
+ continue
+ inst = dict(version=pkg["Version"],
+ filename="%s/%s" % (url, pkg["Filename"]))
+ if addhash:
+ inst["sha256hash"] = pkg["SHA256"]
+ pkgs[name] = inst
def process_file(pkgs, filename):
- base = os.path.basename(filename)
- if not base.endswith(".deb"):
+ if filename.suffix != ".deb":
raise ValueError("filename does not end in .deb")
- parts = base.split("_")
+ parts = filename.name.split("_")
if len(parts) != 3:
raise ValueError("filename not in form name_version_arch.deb")
name, version, _ = parts
- version = unquote(version)
+ version = urllib.parse.unquote(version)
if name in pkgs and version_compare(pkgs[name]["version"], version) > 0:
return
- pkgs[name] = dict(version=version, filename=filename)
+ pkgs[name] = dict(version=version, filename=str(filename))
def process_dir(pkgs, d):
- for entry in os.listdir(d):
+ for entry in d.iterdir():
try:
- process_file(pkgs, os.path.join(d, entry))
+ process_file(pkgs, entry)
except ValueError:
pass
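The update logic above keeps the highest version seen per package name using debian.debian_support.version_compare. A minimal sketch of its ordering semantics:

    from debian.debian_support import version_compare

    # Returns > 0 if the first version is newer, < 0 if older, 0 if equal.
    assert version_compare("2:1.0-1", "1:9.9-9") > 0  # epoch dominates
    assert version_compare("1.0~rc1", "1.0") < 0      # ~ sorts before release
    assert version_compare("1.0-1", "1.0-1") == 0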
@@ -67,11 +58,11 @@ def process_pkg(name, pkgdict, outpath):
importcmd.extend(["-H", pkgdict["sha256hash"]])
if filename.startswith(("http://", "https://", "ftp://", "file://")):
importcmd.append(filename)
- with open(outpath, "w") as outp:
+ with outpath.open("w") as outp:
subprocess.check_call(importcmd, stdout=outp, close_fds=True)
else:
with open(filename) as inp:
- with open(outpath, "w") as outp:
+ with outpath.open("w") as outp:
subprocess.check_call(importcmd, stdin=inp, stdout=outp,
close_fds=True)
print("preprocessed %s" % name)
@@ -90,7 +81,7 @@ def main():
parser.add_argument("files", nargs='+',
help="files or directories or repository urls")
args = parser.parse_args()
- tmpdir = tempfile.mkdtemp(prefix="debian-dedup")
+ tmpdir = pathlib.Path(tempfile.mkdtemp(prefix="debian-dedup"))
db = sqlite3.connect(args.database)
cur = db.cursor()
cur.execute("PRAGMA foreign_keys = ON;")
@@ -100,42 +91,44 @@ def main():
print("processing %s" % d)
if d.startswith(("http://", "https://", "ftp://", "file://")):
process_http(pkgs, d, not args.noverify)
- elif os.path.isdir(d):
- process_dir(pkgs, d)
else:
- process_file(pkgs, d)
+ dp = pathlib.Path(d)
+ if dp.is_dir():
+ process_dir(pkgs, dp)
+ else:
+ process_file(pkgs, dp)
print("reading database")
cur.execute("SELECT name, version FROM package;")
- knownpkgs = dict((row[0], row[1]) for row in cur.fetchall())
+ knownpkgvers = dict((row[0], row[1]) for row in cur.fetchall())
distpkgs = set(pkgs.keys())
if args.new:
for name in distpkgs:
- if name in knownpkgs and version_compare(pkgs[name]["version"],
- knownpkgs[name]) <= 0:
+ if name in knownpkgvers and \
+ version_compare(pkgs[name]["version"], knownpkgvers[name]) <= 0:
del pkgs[name]
- knownpkgs = set(knownpkgs)
+ knownpkgs = set(knownpkgvers)
+ del knownpkgvers
with e:
fs = {}
for name, pkg in pkgs.items():
- outpath = os.path.join(tmpdir, name)
- fs[e.submit(process_pkg, name, pkg, outpath)] = name
+ fs[e.submit(process_pkg, name, pkg, tmpdir / name)] = name
for f in concurrent.futures.as_completed(fs.keys()):
name = fs[f]
if f.exception():
print("%s failed to import: %r" % (name, f.exception()))
continue
- inf = os.path.join(tmpdir, name)
+ inf = tmpdir / name
print("sqlimporting %s" % name)
- with open(inf) as inp:
+ with inf.open() as inp:
try:
readyaml(db, inp)
except Exception as exc:
print("%s failed sql with exception %r" % (name, exc))
else:
- os.unlink(inf)
+ inf.unlink()
if args.prune:
delpkgs = knownpkgs - distpkgs
@@ -146,12 +139,12 @@ def main():
# due to ON DELETE CASCADE clauses.
db.commit()
try:
- os.rmdir(tmpdir)
+ tmpdir.rmdir()
except OSError as err:
if err.errno != errno.ENOTEMPTY:
raise
print("keeping temporary directory %s due to failed packages %s" %
- (tmpdir, " ".join(os.listdir(tmpdir))))
+ (tmpdir, " ".join(map(str, tmpdir.iterdir()))))
if __name__ == "__main__":
main()
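The os.path-to-pathlib conversion in this file boils down to a few correspondences; a sketch with a hypothetical package name:

    import pathlib
    import tempfile

    tmpdir = pathlib.Path(tempfile.mkdtemp(prefix="debian-dedup"))
    outpath = tmpdir / "somepkg"     # replaces os.path.join(tmpdir, name)
    with outpath.open("w") as outp:  # replaces open(outpath, "w")
        outp.write("dummy")
    outpath.unlink()                 # replaces os.unlink(inf)
    tmpdir.rmdir()                   # replaces os.rmdir(tmpdir)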
diff --git a/dedup/arreader.py b/dedup/arreader.py
deleted file mode 100644
index e53efd9..0000000
--- a/dedup/arreader.py
+++ /dev/null
@@ -1,79 +0,0 @@
-import struct
-
-class ArReader(object):
- """Streaming AR file reader. After constructing an object, you usually
- call read_magic once. Then you call read_entry in a loop and use the
- ArReader object as file-like only providing read() to read the respective
- file contents until you get EOFError from read_entry.
- """
- global_magic = b"!<arch>\n"
- file_magic = b"`\n"
-
- def __init__(self, fileobj):
- """
- @param fileobj: a file-like object providing nothing but read(length)
- """
- self.fileobj = fileobj
- self.remaining = None
- self.padding = 0
-
- def read_magic(self):
- """Consume the AR magic marker at the beginning of an AR file. You
- must not call any other method before calling this method.
- @raises ValueError: if the magic is not found
- """
- data = self.fileobj.read(len(self.global_magic))
- if data != self.global_magic:
- raise ValueError("ar global header not found")
- self.remaining = 0
-
- def read_entry(self):
- """Read the next file header, return the filename and record the
- length of the next file, so that the read method can be used to
- exhaustively read the current file.
- @rtype: bytes
- @returns: the name of the next file
- @raises ValueError: if the data format is wrong
- @raises EOFError: when the end of the stream is reached
- """
- self.skip_current_entry()
- if self.padding:
- if self.fileobj.read(1) != b'\n':
- raise ValueError("missing ar padding")
- self.padding = 0
- file_header = self.fileobj.read(60)
- if not file_header:
- raise EOFError("end of archive found")
- parts = struct.unpack("16s 12s 6s 6s 8s 10s 2s", file_header)
- parts = [p.rstrip(b"/ ") for p in parts]
- if parts.pop() != self.file_magic:
- raise ValueError("ar file header not found")
- self.remaining = int(parts[5])
- self.padding = self.remaining % 2
- return parts[0] # name
-
- def skip_current_entry(self):
- """Skip the remainder of the current file. This method must not be
- called before calling read_entry.
- @raises ValueError: if the archive appears truncated
- """
- while self.remaining:
- data = self.fileobj.read(min(4096, self.remaining))
- if not data:
- raise ValueError("archive truncated")
- self.remaining -= len(data)
-
- def read(self, length=None):
- """
- @type length: int or None
- @param length: number of bytes to read from the current file
- @rtype: bytes
- @returns: length or fewer bytes from the current file
- """
- if length is None:
- length = self.remaining
- else:
- length = min(self.remaining, length)
- data = self.fileobj.read(length)
- self.remaining -= len(data)
- return data
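The hand-rolled ArReader above is replaced by the python3-arpy package (see the dedup/debpkg.py diff below). A sketch of the equivalent member iteration, assuming a hypothetical example.deb:

    import arpy

    with open("example.deb", "rb") as f:
        for member in arpy.Archive(fileobj=f):
            # member.header.name is bytes, e.g. b"debian-binary"
            print(member.header.name, len(member.read()))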
diff --git a/dedup/compression.py b/dedup/compression.py
index 8d1912b..da6e9a0 100644
--- a/dedup/compression.py
+++ b/dedup/compression.py
@@ -1,13 +1,10 @@
import bz2
import struct
-import sys
import zlib
import lzma
-crc32_type = "L" if sys.version_info.major >= 3 else "l"
-
-class GzipDecompressor(object):
+class GzipDecompressor:
"""An interface to gzip which is similar to bz2.BZ2Decompressor and
lzma.LZMADecompressor."""
def __init__(self):
@@ -66,7 +63,7 @@ class GzipDecompressor(object):
elif not self.sawheader:
return self.inbuffer
else:
- expect = struct.pack("<" + crc32_type + "L", self.crc, self.size)
+ expect = struct.pack("<LL", self.crc, self.size)
if self.inbuffer.startswith(expect) and \
self.inbuffer[len(expect):].replace(b"\0", b"") == b"":
return b""
@@ -90,7 +87,7 @@ class GzipDecompressor(object):
new.size = self.size
return new
-class DecompressedStream(object):
+class DecompressedStream:
"""Turn a readable file-like into a decompressed file-like. It supports
read(optional length), tell, seek(forward only) and close."""
blocksize = 65536
@@ -104,25 +101,23 @@ class DecompressedStream(object):
"""
self.fileobj = fileobj
self.decompressor = decompressor
- self.buff = b""
+ self.buff = bytearray()
self.pos = 0
- self.closed = False
def _fill_buff_until(self, predicate):
- assert not self.closed
- data = True
- while True:
- if predicate(self.buff) or not data:
- return
+ assert self.fileobj is not None
+ while not predicate(self.buff):
data = self.fileobj.read(self.blocksize)
if data:
self.buff += self.decompressor.decompress(data)
- elif hasattr(self.decompressor, "flush"):
- self.buff += self.decompressor.flush()
+ else:
+ if hasattr(self.decompressor, "flush"):
+ self.buff += self.decompressor.flush()
+ break
def _read_from_buff(self, length):
- ret = self.buff[:length]
- self.buff = self.buff[length:]
+ ret = bytes(self.buff[:length])
+ self.buff[:length] = b""
self.pos += length
return ret
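Switching buff from bytes to a bytearray makes the consume-from-the-front pattern cheap: the slice assignment deletes the prefix in place instead of copying the remaining tail into a fresh bytes object on every read. For illustration:

    buff = bytearray(b"abcdef")
    head = bytes(buff[:2])  # copy out the consumed prefix
    buff[:2] = b""          # drop it in place
    assert head == b"ab" and buff == bytearray(b"cdef")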
@@ -146,12 +141,12 @@ class DecompressedStream(object):
return iter(self.readline, b'')
def tell(self):
- assert not self.closed
+ assert self.fileobj is not None
return self.pos
def seek(self, pos):
"""Forward seeks by absolute position only."""
- assert not self.closed
+ assert self.fileobj is not None
if pos < self.pos:
raise ValueError("negative seek not allowed on decompressed stream")
while True:
@@ -165,18 +160,17 @@ class DecompressedStream(object):
return
def close(self):
- if not self.closed:
+ if self.fileobj is not None:
self.fileobj.close()
self.fileobj = None
self.decompressor = None
- self.buff = b""
- self.closed = True
+ self.buff = bytearray()
decompressors = {
- u'.gz': GzipDecompressor,
- u'.bz2': bz2.BZ2Decompressor,
- u'.lzma': lzma.LZMADecompressor,
- u'.xz': lzma.LZMADecompressor,
+ '.gz': GzipDecompressor,
+ '.bz2': bz2.BZ2Decompressor,
+ '.lzma': lzma.LZMADecompressor,
+ '.xz': lzma.LZMADecompressor,
}
def decompress(filelike, extension):
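The decompressors table maps a filename extension to a decompressor factory, and decompress() wraps a raw stream accordingly. Hypothetical usage with a local Packages.xz:

    from dedup.compression import decompress

    with open("Packages.xz", "rb") as raw:
        pkglist = decompress(raw, ".xz")
        first_chunk = pkglist.read(4096)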
diff --git a/dedup/debpkg.py b/dedup/debpkg.py
index 3a30b3e..de00e60 100644
--- a/dedup/debpkg.py
+++ b/dedup/debpkg.py
@@ -1,13 +1,12 @@
-import sys
import tarfile
+import arpy
from debian import deb822
-from dedup.arreader import ArReader
from dedup.compression import decompress
from dedup.hashing import hash_file
-class MultiHash(object):
+class MultiHash:
def __init__(self, *hashes):
self.hashes = hashes
@@ -30,7 +29,7 @@ def get_tar_hashes(tar, hash_functions):
if not elem.isreg(): # excludes hard links as well
continue
hasher = MultiHash(*[func() for func in hash_functions])
- hasher = hash_file(hasher, tar.extractfile(elem))
+ hash_file(hasher, tar.extractfile(elem))
hashes = {}
for hashobj in hasher.hashes:
hashvalue = hashobj.hexdigest()
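The removed assignment reflects that hash_file() now updates the passed hash object in place and returns nothing (see the dedup/hashing.py diff below). MultiHash simply fans updates out to several hashlib-style objects; a sketch:

    import hashlib
    from dedup.debpkg import MultiHash

    mh = MultiHash(hashlib.sha512(), hashlib.md5())
    mh.update(b"some data")
    for hashobj in mh.hashes:
        print(hashobj.name, hashobj.hexdigest())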
@@ -38,32 +37,11 @@ def get_tar_hashes(tar, hash_functions):
hashes[hashobj.name] = hashvalue
yield (elem.name, elem.size, hashes)
-if sys.version_info.major >= 3:
- def opentar(filelike):
- return tarfile.open(fileobj=filelike, mode="r|", encoding="utf8",
- errors="surrogateescape")
+def opentar(filelike):
+ return tarfile.open(fileobj=filelike, mode="r|", encoding="utf8",
+ errors="surrogateescape")
- def decodetarname(name):
- """Decoded name of a tarinfo.
- @raises UnicodeDecodeError:
- """
- try:
- name.encode("utf8", "strict")
- except UnicodeEncodeError as e:
- if e.reason == "surrogates not allowed":
- name.encode("utf8", "surrogateescape").decode("utf8", "strict")
- return name
-else:
- def opentar(filelike):
- return tarfile.open(fileobj=filelike, mode="r|")
-
- def decodetarname(name):
- """Decoded name of a tarinfo.
- @raises UnicodeDecodeError:
- """
- return name.decode("utf8")
-
-class DebExtractor(object):
+class DebExtractor:
"Base class for extracting desired features from a Debian package."
def __init__(self):
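opentar() now always decodes member names as UTF-8 with the surrogateescape handler, so undecodable bytes survive as lone surrogates; importpkg.py (below) detects them by attempting a strict re-encode. A sketch of the round trip:

    raw = b"caf\xe9.txt"                          # latin-1, not valid UTF-8
    name = raw.decode("utf8", "surrogateescape")  # 'caf\udce9.txt'
    assert name.encode("utf8", "surrogateescape") == raw
    try:
        name.encode("utf8", "strict")
    except UnicodeEncodeError:
        print("name contains undecodable bytes")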
@@ -74,45 +52,36 @@ class DebExtractor(object):
@param filelike: is a file-like object containing the contents of the
Debian package and can be read once without seeks.
"""
- af = ArReader(filelike)
- af.read_magic()
- while True:
- try:
- name = af.read_entry()
- except EOFError:
- break
- else:
- self.handle_ar_member(name, af)
+ af = arpy.Archive(fileobj=filelike)
+ for member in af:
+ self.handle_ar_member(member)
self.handle_ar_end()
- def handle_ar_member(self, name, filelike):
+ def handle_ar_member(self, arfiledata: arpy.ArchiveFileData) -> None:
"""Handle an ar archive member of the Debian package.
If you replace this method, you must also replace handle_ar_end and
none of the methods handle_debversion, handle_control_tar or
handle_data_tar are called.
- @type name: bytes
- @param name: is the name of the member
- @param filelike: is a file-like object containing the contents of the
- member and can be read once without seeks.
"""
+ name = arfiledata.header.name
if self.arstate == "start":
if name != b"debian-binary":
raise ValueError("debian-binary not found")
- version = filelike.read()
+ version = arfiledata.read()
self.handle_debversion(version)
if not version.startswith(b"2."):
raise ValueError("debian version not recognized")
self.arstate = "version"
elif self.arstate == "version":
if name.startswith(b"control.tar"):
- filelike = decompress(filelike, name[11:].decode("ascii"))
+ filelike = decompress(arfiledata, name[11:].decode("ascii"))
self.handle_control_tar(opentar(filelike))
self.arstate = "control"
elif not name.startswith(b"_"):
raise ValueError("unexpected ar member %r" % name)
elif self.arstate == "control":
if name.startswith(b"data.tar"):
- filelike = decompress(filelike, name[8:].decode("ascii"))
+ filelike = decompress(arfiledata, name[8:].decode("ascii"))
self.handle_data_tar(opentar(filelike))
self.arstate = "data"
elif not name.startswith(b"_"):
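The slice offsets recover the compression extension from the ar member name: len(b"control.tar") == 11 and len(b"data.tar") == 8, so everything after the prefix is handed to decompress():

    assert b"control.tar.xz"[11:].decode("ascii") == ".xz"
    assert b"data.tar.gz"[8:].decode("ascii") == ".gz"
    assert b"data.tar"[8:].decode("ascii") == ""  # uncompressed tar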
diff --git a/dedup/filemagic.py b/dedup/filemagic.py
new file mode 100644
index 0000000..b71c276
--- /dev/null
+++ b/dedup/filemagic.py
@@ -0,0 +1,49 @@
+"""A very strange "hash" that uses the magic module (python3-magic) to guess
+the file type."""
+
+import magic
+
+# It changed API a few times...
+try:
+ _magic_identify = magic.from_buffer
+except AttributeError:
+ _magic_identify = magic.none_magic.buffer
+
+class FileDigester:
+ """A hashlib-like class to guess a filetype using the magic module."""
+ FILE_BYTES_MAX = 1024 * 1024 # copied from file source
+
+ def __init__(self):
+ self.buff = b""
+ self.identification = None
+
+ def _compute_identification(self):
+ try:
+ return _magic_identify(self.buff)
+ except UnicodeDecodeError:
+ return "magic identification is not valid UTF-8"
+
+ def update(self, buff):
+ if self.identification:
+ return
+ self.buff += buff
+ if len(self.buff) >= self.FILE_BYTES_MAX:
+ self.identification = self._compute_identification()
+ self.buff = None
+
+ def identify(self):
+ """Return the guessed file magic identification."""
+ if self.identification:
+ return self.identification
+ return self._compute_identification()
+
+ def hexdigest(self):
+ """Compatibility with hashlib. An alias of identify. Doesn't return
+ hex."""
+ return self.identify()
+
+ def copy(self):
+ new = FileDigester()
+ new.buff = self.buff
+ new.identification = self.identification
+ return new
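Hypothetical use of the new FileDigester, driving it like a hashlib object:

    from dedup.filemagic import FileDigester

    fd = FileDigester()
    fd.update(b"\x89PNG\r\n\x1a\n")  # feed it a PNG signature
    print(fd.hexdigest())            # a description such as "PNG image data"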
diff --git a/dedup/hashing.py b/dedup/hashing.py
index 2a83929..9cebcbb 100644
--- a/dedup/hashing.py
+++ b/dedup/hashing.py
@@ -1,10 +1,6 @@
import itertools
-try:
- from itertools import imap as map
-except ImportError:
- pass # in python3 map is already imap
-class HashBlacklist(object):
+class HashBlacklist:
"""Turn a hashlib-like object into a hash that returns None for some
blacklisted hashes instead of the real hash value.
@@ -35,7 +31,7 @@ class HashBlacklist(object):
def copy(self):
return HashBlacklist(self.hashobj.copy(), self.blacklist)
-class HashBlacklistContent(object):
+class HashBlacklistContent:
"""Turn a hashlib-like object into a hash that returns None for some
blacklisted content instead of the real hash value. Unlike HashBlacklist,
not the output of the hash is considered, but its input."""
@@ -80,13 +76,15 @@ class HashBlacklistContent(object):
return self.hashobj.hexdigest()
def copy(self):
- return HashBlacklistContent(self.hashobj.copy(), self.blacklist,
- self.maxlen)
+ new = HashBlacklistContent(self.hashobj.copy(), self.blacklist,
+ self.maxlen)
+ new.stored = self.stored
+ return new
-class DecompressedHash(object):
+class DecompressedHash:
"""Apply a decompression function before the hash. This class provides the
hashlib interface (update, hexdigest, copy) excluding digest and name."""
- def __init__(self, decompressor, hashobj):
+ def __init__(self, decompressor, hashobj, name="unnamed"):
"""
@param decompressor: a decompression object like bz2.BZ2Decompressor or
lzma.LZMADecompressor. It has to provide methods decompress and
@@ -94,9 +92,11 @@ class DecompressedHash(object):
method.
@param hashobj: a hashlib-like obj providing methods update, hexdigest
and copy
+ @param name: initializes the name property
"""
self.decompressor = decompressor
self.hashobj = hashobj
+ self.name = name
def update(self, data):
self.hashobj.update(self.decompressor.decompress(data))
@@ -115,9 +115,10 @@ class DecompressedHash(object):
return tmphash.hexdigest()
def copy(self):
- return DecompressedHash(self.decompressor.copy(), self.hashobj.copy())
+ return DecompressedHash(self.decompressor.copy(), self.hashobj.copy(),
+ self.name)
-class SuppressingHash(object):
+class SuppressingHash:
"""A hash that silences exceptions from the update and hexdigest methods of
a hashlib-like object. If an exception has occurred, hexdigest always
returns None."""
@@ -163,9 +164,8 @@ def hash_file(hashobj, filelike, blocksize=65536):
while data:
hashobj.update(data)
data = filelike.read(blocksize)
- return hashobj
-class HashedStream(object):
+class HashedStream:
"""A file-like object, that supports sequential reading and hashes the
contents on the fly."""
def __init__(self, filelike, hashobj):
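hash_file() no longer returns the hash object; callers keep their own reference and read the digest afterwards:

    import hashlib
    from dedup.hashing import hash_file

    h = hashlib.sha256()
    with open("somefile", "rb") as f:  # hypothetical input file
        hash_file(h, f)                # mutates h, returns None
    print(h.hexdigest())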
diff --git a/dedup/image.py b/dedup/image.py
index 314eb44..91321f4 100644
--- a/dedup/image.py
+++ b/dedup/image.py
@@ -3,7 +3,7 @@ import struct
import PIL.Image
-class ImageHash(object):
+class ImageHash:
"""A hash on the contents of an image data type supported by PIL. This
disregards mode, depth and meta information. Note that due to limitations
in PIL and the image format (interlacing) the full contents are stored and
@@ -69,9 +69,14 @@ class ImageHash(object):
self.content.seek(pos)
return "%s%8.8x%8.8x" % (hashobj.hexdigest(), width, height)
+ @property
+ def name(self):
+ return self.name_prefix + self.hashobj.name
+
class PNGHash(ImageHash):
"""A hash on the contents of a PNG image."""
+ name_prefix = "png_"
def detect(self):
if self.content.tell() < 33: # header + IHDR
@@ -86,6 +91,7 @@ class PNGHash(ImageHash):
class GIFHash(ImageHash):
"""A hash on the contents of the first frame of a GIF image."""
+ name_prefix = "gif_"
def detect(self):
if self.content.tell() < 10: # magic + logical dimension
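With the new name property, the "png_"/"gif_" prefix is derived from the wrapped hash instead of being assigned externally in importpkg.py:

    import hashlib
    from dedup.image import GIFHash, PNGHash

    assert PNGHash(hashlib.sha512()).name == "png_sha512"
    assert GIFHash(hashlib.sha512()).name == "gif_sha512"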
diff --git a/dedup/templates/base.html b/dedup/templates/base.html
index 9dfb788..bac516e 100644
--- a/dedup/templates/base.html
+++ b/dedup/templates/base.html
@@ -15,7 +15,7 @@
<ul>
<li>More information: <a href="https://wiki.debian.org/dedup.debian.net">see wiki</a></li>
<li>Maintainer: Helmut Grohne &lt;helmut@subdivi.de&gt;</li>
- <li>Source: git://murkel.subdivi.de/~helmut/debian-dedup.git</li>
+ <li>Source: git://git.subdivi.de/~helmut/debian-dedup.git</li>
<li>Bugs reports / Feedback / Patches: to the maintainer</li>
</ul>
</body>
diff --git a/dedup/utils.py b/dedup/utils.py
index dab6653..55cdef0 100644
--- a/dedup/utils.py
+++ b/dedup/utils.py
@@ -1,29 +1,19 @@
+import contextlib
import errno
-try:
- from urllib.error import URLError, HTTPError
-except ImportError:
- from urllib2 import URLError, HTTPError
-try:
- from urllib.request import urlopen
-except ImportError:
- from urllib2 import urlopen
+import urllib.error
+import urllib.request
-from debian.debian_support import version_compare
+import debian.deb822
from dedup.compression import decompress
def fetchiter(cursor):
rows = cursor.fetchmany()
while rows:
- for row in rows:
- yield row
+ yield from rows
rows = cursor.fetchmany()
-def sql_add_version_compare(db):
- db.create_collation("debian_version", version_compare)
- db.create_function("debian_version_compare", 2, version_compare)
-
-def open_compressed_mirror_url(url, extensions=(u".xz", u".gz", u"")):
+def open_compressed_mirror_url(url, extensions=(".xz", ".gz", "")):
"""Fetch the given url. Try appending each of the given compression
schemes and move on in case it doesn't exist. Decompress the resulting
stream on the fly.
@@ -31,11 +21,11 @@ def open_compressed_mirror_url(url, extensions=(u".xz", u".gz", u"")):
"""
for ext in extensions:
try:
- handle = urlopen(url + ext)
- except HTTPError as error:
+ handle = urllib.request.urlopen(url + ext)
+ except urllib.error.HTTPError as error:
if error.code != 404:
raise
- except URLError as error:
+ except urllib.error.URLError as error:
if not hasattr(error.reason, "errno"):
raise
if error.reason.errno != errno.ENOENT:
@@ -43,3 +33,11 @@ def open_compressed_mirror_url(url, extensions=(u".xz", u".gz", u"")):
else:
return decompress(handle, ext)
raise OSError(errno.ENOENT, "No such file or directory")
+
+def iterate_packages(mirror, architecture, distribution="sid", section="main"):
+ """Download the relevant binary package list and generate
+ debian.deb822.Packages objects per listed package."""
+ url = "%s/dists/%s/%s/binary-%s/Packages" % \
+ (mirror, distribution, section, architecture)
+ with contextlib.closing(open_compressed_mirror_url(url)) as pkglist:
+ yield from debian.deb822.Packages.iter_paragraphs(pkglist)
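iterate_packages() factors out the Packages download previously inlined in autoimport.py. Hypothetical usage against a Debian mirror:

    from dedup.utils import iterate_packages

    for pkg in iterate_packages("http://deb.debian.org/debian", "amd64"):
        print(pkg["Package"], pkg["Version"], pkg["Filename"])
        break  # the full list is large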
diff --git a/importpkg.py b/importpkg.py
index ce4a446..7bca70b 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3
"""This tool reads a Debian package from stdin and emits a yaml stream on
stdout. It does not access a database. Therefore it can be run in parallel and
on multiple machines. The generated yaml contains multiple documents. The first
@@ -8,42 +8,33 @@ And finally a document consisting of the string "commit" is emitted."""
import argparse
import hashlib
import sys
+import urllib.request
import zlib
-try:
- from urllib.request import urlopen
-except ImportError:
- from urllib2 import urlopen
import yaml
-from dedup.debpkg import DebExtractor, decodetarname, get_tar_hashes
+from dedup.debpkg import DebExtractor, get_tar_hashes
from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
HashBlacklistContent
from dedup.compression import GzipDecompressor
from dedup.image import GIFHash, PNGHash
-boring_content = set(("", "\n"))
+boring_content = set((b"", b"\n"))
def sha512_nontrivial():
return HashBlacklistContent(hashlib.sha512(), boring_content)
def gziphash():
- hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512())
+ hashobj = hashlib.sha512()
+ hashobj = DecompressedHash(GzipDecompressor(), hashobj, "gzip_sha512")
hashobj = SuppressingHash(hashobj, (ValueError, zlib.error))
- hashobj.name = "gzip_sha512"
return HashBlacklistContent(hashobj, boring_content)
def pnghash():
- hashobj = PNGHash(hashlib.sha512())
- hashobj = SuppressingHash(hashobj, (ValueError,))
- hashobj.name = "png_sha512"
- return hashobj
+ return SuppressingHash(PNGHash(hashlib.sha512()), (ValueError,))
def gifhash():
- hashobj = GIFHash(hashlib.sha512())
- hashobj = SuppressingHash(hashobj, (ValueError,))
- hashobj.name = "gif_sha512"
- return hashobj
+ return SuppressingHash(GIFHash(hashlib.sha512()), (ValueError,))
class ProcessingFinished(Exception):
pass
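Since tar content is now handled as bytes throughout, the blacklist entries are bytes too; hashing blacklisted (trivial) content yields None. A sketch of the intended behaviour, using the definitions above:

    h = sha512_nontrivial()
    h.update(b"\n")
    assert h.hexdigest() is None  # trivial content is suppressed

    h = sha512_nontrivial()
    h.update(b"real content")
    assert h.hexdigest() is not None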
@@ -63,7 +54,7 @@ class ImportpkgExtractor(DebExtractor):
# deb822 currently returns :any dependencies raw. see #670679
deprelations = info.relations.get("depends", []) + \
info.relations.get("pre-depends", [])
- depends = set(dep[0]["name"].split(u':', 1)[0]
+ depends = set(dep[0]["name"].split(':', 1)[0]
for dep in deprelations if len(dep) == 1)
self.callback(dict(package=info["package"], source=source,
version=info["version"],
@@ -73,22 +64,19 @@ class ImportpkgExtractor(DebExtractor):
for name, size, hashes in get_tar_hashes(tarfileobj,
self.hash_functions):
try:
- name = decodetarname(name)
- except UnicodeDecodeError:
+ name.encode("utf8", "strict")
+ except UnicodeEncodeError:
print("warning: skipping filename with encoding error")
continue # skip files with non-utf8 encoding for now
self.callback(dict(name=name, size=size, hashes=hashes))
raise ProcessingFinished()
def main():
- try:
- stdin = sys.stdin.buffer
- except AttributeError: # python2
- stdin = sys.stdin
parser = argparse.ArgumentParser()
parser.add_argument("-H", "--hash", action="store",
help="verify that stdin hash given sha256 hash")
- parser.add_argument("input", nargs='?', default=stdin, type=urlopen,
+ parser.add_argument("input", nargs='?', default=sys.stdin.buffer,
+ type=urllib.request.urlopen,
help="read from this location instead of stdin")
args = parser.parse_args()
dumper = yaml.SafeDumper(sys.stdout)
diff --git a/readyaml.py b/readyaml.py
index b6f7316..a4837cf 100755
--- a/readyaml.py
+++ b/readyaml.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3
"""This tool reads a yaml file as generated by importpkg.py on stdin and
updates the database with the contents."""
diff --git a/update_sharing.py b/update_sharing.py
index ac6c945..78e6171 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3
import argparse
import sqlite3
diff --git a/webapp.py b/webapp.py
index f9e667e..162a5a4 100755
--- a/webapp.py
+++ b/webapp.py
@@ -1,8 +1,9 @@
-#!/usr/bin/python
+#!/usr/bin/python3
import argparse
import contextlib
import datetime
+import io
import sqlite3
from wsgiref.simple_server import make_server
@@ -11,25 +12,28 @@ from werkzeug.exceptions import HTTPException, NotFound
from werkzeug.routing import Map, Rule
from werkzeug.utils import redirect
from werkzeug.wrappers import Request, Response
-from werkzeug.wsgi import SharedDataMiddleware
+try:
+ from werkzeug.middleware.shared_data import SharedDataMiddleware
+except ImportError:
+ from werkzeug.wsgi import SharedDataMiddleware
from dedup.utils import fetchiter
jinjaenv = jinja2.Environment(loader=jinja2.PackageLoader("dedup", "templates"))
def format_size(size):
- size = float(size)
+ sizef = float(size)
fmt = "%d B"
- if size >= 1024:
- size /= 1024
+ if sizef >= 1024:
+ sizef /= 1024
fmt = "%.1f KB"
- if size >= 1024:
- size /= 1024
+ if sizef >= 1024:
+ sizef /= 1024
fmt = "%.1f MB"
- if size >= 1024:
- size /= 1024
+ if sizef >= 1024:
+ sizef /= 1024
fmt = "%.1f GB"
- return fmt % size
+ return fmt % sizef
def function_combination(function1, function2):
if function1 == function2:
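format_size behaviour is unchanged; the rename to sizef just avoids rebinding the int parameter to a float. For example:

    assert format_size(512) == "512 B"
    assert format_size(2048) == "2.0 KB"
    assert format_size(5 * 1024 * 1024) == "5.0 MB"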
@@ -46,15 +50,16 @@ hash_template = jinjaenv.get_template("hash.html")
index_template = jinjaenv.get_template("index.html")
source_template = jinjaenv.get_template("source.html")
-def encode_and_buffer(iterator):
- buff = b""
- for elem in iterator:
- buff += elem.encode("utf8")
- if len(buff) >= 2048:
- yield buff
- buff = b""
- if buff:
- yield buff
+def encode_and_buffer(stream):
+ stream.enable_buffering(16)
+ buff = io.BytesIO()
+ for elem in stream:
+ buff.write(elem.encode("utf8"))
+ if buff.tell() >= 2048:
+ yield buff.getvalue()
+ buff = io.BytesIO()
+ if buff.tell() > 0:
+ yield buff.getvalue()
def html_response(unicode_iterator, max_age=24 * 60 * 60):
resp = Response(encode_and_buffer(unicode_iterator), mimetype="text/html")
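encode_and_buffer() now takes a jinja2 template stream rather than a plain string iterator, enabling jinja2's own buffering and collecting output in a BytesIO instead of repeated bytes concatenation. Hypothetical usage:

    template = jinjaenv.from_string("{% for i in range(999) %}{{ i }} {% endfor %}")
    for chunk in encode_and_buffer(template.stream()):
        pass  # chunks are bytes; all but the last are at least 2 KB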
@@ -68,7 +73,7 @@ class InternalRedirect(Exception):
self.target = target
self.code = code
-class Application(object):
+class Application:
def __init__(self, db):
self.db = db
self.routingmap = Map([
@@ -79,6 +84,9 @@ class Application(object):
Rule("/source/<package>", methods=("GET",), endpoint="source"),
])
+ def cursor(self):
+ return contextlib.closing(self.db.cursor())
+
@Request.application
def __call__(self, request):
mapadapter = self.routingmap.bind_to_environ(request.environ)
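The new cursor() helper centralizes the contextlib.closing dance; sqlite3 cursors are not context managers themselves. A standalone equivalent:

    import contextlib
    import sqlite3

    db = sqlite3.connect(":memory:")
    with contextlib.closing(db.cursor()) as cur:  # what self.cursor() wraps
        cur.execute("SELECT 1;")
        print(cur.fetchone())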
@@ -97,7 +105,7 @@ class Application(object):
elif endpoint == "index":
if not request.environ["PATH_INFO"]:
raise InternalRedirect("/")
- return html_response(index_template.render(dict(urlroot="")))
+ return html_response(index_template.stream(dict(urlroot="")))
elif endpoint == "source":
return self.show_source(args["package"])
raise NotFound()
@@ -107,7 +115,7 @@ class Application(object):
return e
def get_details(self, package):
- with contextlib.closing(self.db.cursor()) as cur:
+ with self.cursor() as cur:
cur.execute("SELECT id, version, architecture FROM package WHERE name = ?;",
(package,))
row = cur.fetchone()
@@ -127,14 +135,14 @@ class Application(object):
return details
def get_dependencies(self, pid):
- with contextlib.closing(self.db.cursor()) as cur:
+ with self.cursor() as cur:
cur.execute("SELECT required FROM dependency WHERE pid = ?;",
(pid,))
return set(row[0] for row in fetchiter(cur))
def cached_sharedstats(self, pid):
sharedstats = {}
- with contextlib.closing(self.db.cursor()) as cur:
+ with self.cursor() as cur:
cur.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;",
(pid,))
for pid2, package2, func1, func2, files, size in fetchiter(cur):
@@ -151,12 +159,11 @@ class Application(object):
params["dependencies"] = self.get_dependencies(params["pid"])
params["shared"] = self.cached_sharedstats(params["pid"])
params["urlroot"] = ".."
- cur = self.db.cursor()
- cur.execute("SELECT content.filename, issue.issue FROM content JOIN issue ON content.id = issue.cid WHERE content.pid = ?;",
- (params["pid"],))
- params["issues"] = dict(cur.fetchall())
- cur.close()
- return html_response(package_template.render(params))
+ with self.cursor() as cur:
+ cur.execute("SELECT content.filename, issue.issue FROM content JOIN issue ON content.id = issue.cid WHERE content.pid = ?;",
+ (params["pid"],))
+ params["issues"] = dict(cur.fetchall())
+ return html_response(package_template.stream(params))
def compute_comparison(self, pid1, pid2):
"""Compute a sequence of comparison objects ordered by the size of the
@@ -168,35 +175,33 @@ class Application(object):
* matches: A mapping from filenames in package 2 (pid2) to a mapping
from hash function pairs to hash values.
"""
- cur = self.db.cursor()
- cur.execute("SELECT content.id, content.filename, content.size, hash.hash FROM content JOIN hash ON content.id = hash.cid JOIN duplicate ON content.id = duplicate.cid JOIN function ON hash.fid = function.id WHERE pid = ? AND function.name = 'sha512' ORDER BY size DESC;",
- (pid1,))
- cursize = -1
- files = dict()
- minmatch = 2 if pid1 == pid2 else 1
- cur2 = self.db.cursor()
- for cid, filename, size, hashvalue in fetchiter(cur):
- if cursize != size:
- for entry in files.values():
- if len(entry["matches"]) >= minmatch:
- yield entry
- files.clear()
- cursize = size
+ with self.cursor() as cur, self.cursor() as cur2:
+ cur.execute("SELECT content.id, content.filename, content.size, hash.hash FROM content JOIN hash ON content.id = hash.cid JOIN duplicate ON content.id = duplicate.cid JOIN function ON hash.fid = function.id WHERE pid = ? AND function.name = 'sha512' ORDER BY size DESC;",
+ (pid1,))
+ cursize = -1
+ files = dict()
+ minmatch = 2 if pid1 == pid2 else 1
+ for cid, filename, size, hashvalue in fetchiter(cur):
+ if cursize != size:
+ for entry in files.values():
+ if len(entry["matches"]) >= minmatch:
+ yield entry
+ files.clear()
+ cursize = size
- if hashvalue in files:
- files[hashvalue]["filenames"].add(filename)
- continue
+ if hashvalue in files:
+ files[hashvalue]["filenames"].add(filename)
+ continue
- entry = dict(filenames=set((filename,)), size=size, matches={})
- files[hashvalue] = entry
+ entry = dict(filenames=set((filename,)), size=size, matches={})
+ files[hashvalue] = entry
- cur2.execute("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = ? AND pid = ? AND fa.eqclass = fb.eqclass;",
- (cid, pid2))
- for func1, hashvalue, func2, filename in fetchiter(cur2):
- entry["matches"].setdefault(filename, {})[func1, func2] = \
- hashvalue
- cur2.close()
- cur.close()
+ cur2.execute("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = ? AND pid = ? AND fa.eqclass = fb.eqclass;",
+ (cid, pid2))
+ for func1, hashvalue, func2, filename in fetchiter(cur2):
+ entry["matches"].setdefault(filename, {})[func1, func2] = \
+ hashvalue
for entry in files.values():
if len(entry["matches"]) >= minmatch:
@@ -216,7 +221,7 @@ class Application(object):
return html_response(detail_template.stream(params))
def show_hash(self, function, hashvalue):
- with contextlib.closing(self.db.cursor()) as cur:
+ with self.cursor() as cur:
cur.execute("SELECT package.name, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = ? AND hash = ?;",
(function, hashvalue,))
entries = [dict(package=package, filename=filename, size=size,
@@ -234,10 +239,10 @@ class Application(object):
raise NotFound()
params = dict(function=function, hashvalue=hashvalue, entries=entries,
urlroot="../..")
- return html_response(hash_template.render(params))
+ return html_response(hash_template.stream(params))
def show_source(self, package):
- with contextlib.closing(self.db.cursor()) as cur:
+ with self.cursor() as cur:
cur.execute("SELECT name FROM package WHERE source = ?;",
(package,))
binpkgs = dict.fromkeys(pkg for pkg, in fetchiter(cur))
@@ -253,7 +258,7 @@ class Application(object):
if not (oldentry and oldentry["savable"] >= size):
binpkgs[binary] = entry
params = dict(source=package, packages=binpkgs, urlroot="..")
- return html_response(source_template.render(params))
+ return html_response(source_template.stream(params))
def main():
parser = argparse.ArgumentParser()