summaryrefslogtreecommitdiff
path: root/dedup/filemagic.py
blob: b71c276797b807e6397a7473d09a15df0c3331d0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
"""A very strange "hash" that uses the magic module (python3-magic) to guess
the file type."""

import magic

# The magic module has changed its API a few times: use the module-level
# from_buffer when this install provides it, otherwise fall back to the
# buffer method of none_magic.
if hasattr(magic, "from_buffer"):
    _magic_identify = magic.from_buffer
else:
    _magic_identify = magic.none_magic.buffer

class FileDigester:
    """A hashlib-like class to guess a filetype using the magic module.

    Data fed in through update() is buffered until at least FILE_BYTES_MAX
    bytes have been collected. At that point the identification is computed
    once, cached, and the buffer is released. identify() (or its alias
    hexdigest()) returns the cached value, or computes one from whatever
    has been buffered so far.
    """
    FILE_BYTES_MAX = 1024 * 1024 # copied from file source

    def __init__(self):
        # Bytes collected so far; set to None once the result is cached.
        self.buff = b""
        # Cached identification; None means "not computed yet".
        self.identification = None

    def _compute_identification(self):
        """Identify the currently buffered data.

        Returns whatever the magic identification function returns for the
        buffer, or a fixed placeholder string if it raises
        UnicodeDecodeError.
        """
        try:
            return _magic_identify(self.buff)
        except UnicodeDecodeError:
            return "magic identification is not valid UTF-8"

    def update(self, buff):
        """Feed more data into the digester.

        Once an identification has been cached, further data is ignored.
        """
        # Compare against None instead of relying on truthiness: a cached
        # but falsy identification (e.g. an empty string) must still count
        # as "done", because self.buff has already been released and
        # appending to it would raise TypeError.
        if self.identification is not None:
            return
        self.buff += buff
        if len(self.buff) >= self.FILE_BYTES_MAX:
            self.identification = self._compute_identification()
            # Release the buffered data; only the cached result is needed.
            self.buff = None

    def identify(self):
        """Return the guessed file magic identification."""
        # Same None check as in update(): a falsy cached value must not
        # trigger a recomputation on the already released buffer.
        if self.identification is not None:
            return self.identification
        return self._compute_identification()

    def hexdigest(self):
        """Compatibility with hashlib. An alias of identify. Doesn't return
        hex."""
        return self.identify()

    def copy(self):
        """Return a new FileDigester with the same buffer and cached result."""
        new = FileDigester()
        # Sharing the buffer is safe: update() rebinds self.buff with +=
        # on an immutable bytes object instead of mutating it in place.
        new.buff = self.buff
        new.identification = self.identification
        return new