summaryrefslogtreecommitdiff
path: root/dedup/arreader.py
blob: d74ae3716ca2c4961cc43ce47578adc7348aad21 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import struct

class ArReader(object):
    """Streaming AR file reader. After constructing an object, you usually
    call read_magic once. Then you call read_entry in a loop and use the
    ArReader object as file-like only providing read() to read the respective
    file contents until you get EOFError from read_entry.

    The reader keeps two pieces of state between calls: ``remaining`` is the
    number of bytes of the current member that were not consumed yet and
    ``padding`` is 1 if the current member is followed by a padding newline
    (members with odd length) and 0 otherwise.
    """
    global_magic = b"!<arch>\n"
    file_magic = b"`\n"
    # Fixed-size member header. Only the first field (name), the sixth field
    # (size) and the trailing magic are interpreted by this class.
    _header_struct = struct.Struct("16s 12s 6s 6s 8s 10s 2s")

    def __init__(self, fileobj):
        """
        @param fileobj: a file-like object providing nothing but read(length)
        """
        self.fileobj = fileobj
        self.remaining = None
        self.padding = 0

    def _read_exactly(self, length):
        """Read length bytes from the underlying file object, retrying on
        short reads. Fewer bytes are only returned when the underlying file
        object signals end of file by returning an empty result.
        @type length: int
        @rtype: bytes
        """
        chunks = []
        missing = length
        while missing:
            data = self.fileobj.read(missing)
            if not data:
                break
            chunks.append(data)
            missing -= len(data)
        return b"".join(chunks)

    def read_magic(self):
        """Consume the AR magic marker at the beginning of an AR file. You
        must not call any other method before calling this method.
        @raises ValueError: if the magic is not found
        """
        data = self._read_exactly(len(self.global_magic))
        if data != self.global_magic:
            raise ValueError("ar global header not found")
        self.remaining = 0

    def read_entry(self):
        """Read the next file header, return the filename and record the
        length of the next file, so that the read method can be used to
        exhaustively read the current file.
        @rtype: bytes
        @returns: the name of the next file
        @raises ValueError: if the data format is wrong
        @raises EOFError: when the end of the stream is reached
        """
        self.skip_current_entry()
        if self.padding:
            if self.fileobj.read(1) != b'\n':
                raise ValueError("missing ar padding")
            self.padding = 0
        file_header = self._read_exactly(self._header_struct.size)
        if not file_header:
            raise EOFError("end of archive found")
        # A partial header would make struct raise struct.error, which is not
        # a ValueError. Report it as the documented exception type instead.
        if len(file_header) != self._header_struct.size:
            raise ValueError("ar file header truncated")
        parts = self._header_struct.unpack(file_header)
        parts = [p.rstrip(b" ") for p in parts]
        if parts.pop() != self.file_magic:
            raise ValueError("ar file header not found")
        size = int(parts[5])  # raises ValueError for non-numeric sizes
        # A negative size would turn into a negative read length later on,
        # which file objects interpret as "read everything".
        if size < 0:
            raise ValueError("negative ar file size")
        self.remaining = size
        self.padding = size % 2
        return parts[0] # name

    def skip_current_entry(self):
        """Skip the remainder of the current file. This method must not be
        called before calling read_magic.
        @raises ValueError: if the archive appears truncated
        """
        while self.remaining:
            data = self.fileobj.read(min(4096, self.remaining))
            if not data:
                raise ValueError("archive truncated")
            self.remaining -= len(data)

    def read(self, length=None):
        """
        @type length: int or None
        @param length: number of bytes to read from the current file. None or
            a negative value requests everything that is left of the current
            file.
        @rtype: bytes
        @returns: length or fewer bytes from the current file
        """
        if length is None or length < 0:
            # Never hand a negative length to the underlying file object: it
            # would read beyond the end of the current member.
            length = self.remaining
        else:
            length = min(self.remaining, length)
        data = self.fileobj.read(length)
        self.remaining -= len(data)
        return data