summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHelmut Grohne <helmut@subdivi.de>2020-10-25 10:20:34 +0100
committerHelmut Grohne <helmut@subdivi.de>2020-10-25 10:20:34 +0100
commit4542d84439bbc6bd8f3151a9cb61d0ee85cd910e (patch)
treed615a378d7c4dc0113f23619dead9a6a099eecb3
parentb4251e2cc3453852d93ad6a2c0c116991982c2f9 (diff)
downloaddebian-dedup-4542d84439bbc6bd8f3151a9cb61d0ee85cd910e.tar.gz
externalize ar parsing to arpy
-rw-r--r--README2
-rw-r--r--dedup/arreader.py79
-rw-r--r--dedup/debpkg.py27
3 files changed, 10 insertions, 98 deletions
diff --git a/README b/README
index 4572c8a..ed4e8cb 100644
--- a/README
+++ b/README
@@ -1,7 +1,7 @@
Required packages
-----------------
- aptitude install python3 python3-debian python3-lzma python3-jinja2 python3-werkzeug sqlite3 python3-pil python3-yaml python3-concurrent.futures python3-pkg-resources
+ aptitude install python3 python3-arpy python3-debian python3-lzma python3-jinja2 python3-werkzeug sqlite3 python3-pil python3-yaml python3-concurrent.futures python3-pkg-resources
Create a database
-----------------
diff --git a/dedup/arreader.py b/dedup/arreader.py
deleted file mode 100644
index 8b14ff9..0000000
--- a/dedup/arreader.py
+++ /dev/null
@@ -1,79 +0,0 @@
-import struct
-
-class ArReader:
- """Streaming AR file reader. After constructing an object, you usually
- call read_magic once. Then you call read_entry in a loop and use the
- ArReader object as file-like only providing read() to read the respective
- file contents until you get EOFError from read_entry.
- """
- global_magic = b"!<arch>\n"
- file_magic = b"`\n"
-
- def __init__(self, fileobj):
- """
- @param fileobj: a file-like object providing nothing but read(length)
- """
- self.fileobj = fileobj
- self.remaining = None
- self.padding = 0
-
- def read_magic(self):
- """Consume the AR magic marker at the beginning of an AR file. You
- must not call any other method before calling this method.
- @raises ValueError: if the magic is not found
- """
- data = self.fileobj.read(len(self.global_magic))
- if data != self.global_magic:
- raise ValueError("ar global header not found")
- self.remaining = 0
-
- def read_entry(self):
- """Read the next file header, return the filename and record the
- length of the next file, so that the read method can be used to
- exhaustively read the current file.
- @rtype: bytes
- @returns: the name of the next file
- @raises ValueError: if the data format is wrong
- @raises EOFError: when the end of the stream is reached
- """
- self.skip_current_entry()
- if self.padding:
- if self.fileobj.read(1) != b'\n':
- raise ValueError("missing ar padding")
- self.padding = 0
- file_header = self.fileobj.read(60)
- if not file_header:
- raise EOFError("end of archive found")
- parts = struct.unpack("16s 12s 6s 6s 8s 10s 2s", file_header)
- parts = [p.rstrip(b"/ ") for p in parts]
- if parts.pop() != self.file_magic:
- raise ValueError("ar file header not found")
- self.remaining = int(parts[5])
- self.padding = self.remaining % 2
- return parts[0] # name
-
- def skip_current_entry(self):
- """Skip the remainder of the current file. This method must not be
- called before calling read_entry.
- @raises ValueError: if the archive appears truncated
- """
- while self.remaining:
- data = self.fileobj.read(min(4096, self.remaining))
- if not data:
- raise ValueError("archive truncated")
- self.remaining -= len(data)
-
- def read(self, length=None):
- """
- @type length: int or None
- @param length: number of bytes to read from the current file
- @rtype: bytes
- @returns: length or fewer bytes from the current file
- """
- if length is None:
- length = self.remaining
- else:
- length = min(self.remaining, length)
- data = self.fileobj.read(length)
- self.remaining -= len(data)
- return data
diff --git a/dedup/debpkg.py b/dedup/debpkg.py
index 38086ec..0ecb123 100644
--- a/dedup/debpkg.py
+++ b/dedup/debpkg.py
@@ -1,8 +1,8 @@
import tarfile
+import arpy
from debian import deb822
-from dedup.arreader import ArReader
from dedup.compression import decompress
from dedup.hashing import hash_file
@@ -52,45 +52,36 @@ class DebExtractor:
@param filelike: is a file-like object containing the contents of the
Debian package and can be read once without seeks.
"""
- af = ArReader(filelike)
- af.read_magic()
- while True:
- try:
- name = af.read_entry()
- except EOFError:
- break
- else:
- self.handle_ar_member(name, af)
+ af = arpy.Archive(fileobj=filelike)
+ for member in af:
+ self.handle_ar_member(member)
self.handle_ar_end()
- def handle_ar_member(self, name, filelike):
+ def handle_ar_member(self, arfiledata: arpy.ArchiveFileData) -> None:
"""Handle an ar archive member of the Debian package.
If you replace this method, you must also replace handle_ar_end and
none of the methods handle_debversion, handle_control_tar or
handle_data_tar are called.
- @type name: bytes
- @param name: is the name of the member
- @param filelike: is a file-like object containing the contents of the
- member and can be read once without seeks.
"""
+ name = arfiledata.header.name
if self.arstate == "start":
if name != b"debian-binary":
raise ValueError("debian-binary not found")
- version = filelike.read()
+ version = arfiledata.read()
self.handle_debversion(version)
if not version.startswith(b"2."):
raise ValueError("debian version not recognized")
self.arstate = "version"
elif self.arstate == "version":
if name.startswith(b"control.tar"):
- filelike = decompress(filelike, name[11:].decode("ascii"))
+ filelike = decompress(arfiledata, name[11:].decode("ascii"))
self.handle_control_tar(opentar(filelike))
self.arstate = "control"
elif not name.startswith(b"_"):
raise ValueError("unexpected ar member %r" % name)
elif self.arstate == "control":
if name.startswith(b"data.tar"):
- filelike = decompress(filelike, name[8:].decode("ascii"))
+ filelike = decompress(arfiledata, name[8:].decode("ascii"))
self.handle_data_tar(opentar(filelike))
self.arstate = "data"
elif not name.startswith(b"_"):