diff options
Diffstat (limited to 'dedup/filemagic.py')
-rw-r--r-- | dedup/filemagic.py | 43 |
1 files changed, 43 insertions, 0 deletions
class FileDigester(object):
    """A hashlib-like class to guess a filetype using the magic module.

    This is a very strange "hash": instead of a checksum it yields the file
    type guessed by the magic module (python3-magic). Feed data with
    update() and read the guess with identify() or hexdigest().

    Attributes:
        buff: bytes collected so far, or None once the identification has
            been fixed and the buffer released.
        identification: the fixed identification, or None while data is
            still being collected.
    """
    FILE_BYTES_MAX = 1024 * 1024 # copied from file source

    def __init__(self):
        self.buff = b""
        self.identification = None

    def _compute_identification(self):
        """Ask the magic module to identify the currently buffered bytes.

        Returns the value of magic.none_magic.buffer() for the buffer, or a
        fixed placeholder message when that call raises UnicodeDecodeError.
        """
        # Imported locally so the class carries its own dependency on the
        # magic module (python3-magic).
        import magic
        try:
            return magic.none_magic.buffer(self.buff)
        except UnicodeDecodeError:
            return "magic identification is not valid UTF-8"

    def update(self, buff):
        """Append buff (bytes) to the collected data.

        Once at least FILE_BYTES_MAX bytes are collected, the identification
        is computed and stored and the buffer is dropped; any later calls
        are ignored.
        """
        # Compare against None rather than truthiness: a falsy
        # identification (e.g. an empty string) still means the buffer has
        # been released, and appending to None would raise TypeError.
        if self.identification is not None:
            return
        self.buff += buff
        if len(self.buff) >= self.FILE_BYTES_MAX:
            self.identification = self._compute_identification()
            self.buff = None

    def identify(self):
        """Return the guessed file magic identification.

        If the identification has not been fixed yet, it is computed from
        the bytes collected so far without being stored, so further
        update() calls can still refine the result.
        """
        if self.identification is not None:
            return self.identification
        return self._compute_identification()

    def hexdigest(self):
        """Compatibility with hashlib. An alias of identify. Doesn't return
        hex."""
        return self.identify()

    def copy(self):
        """Return a new FileDigester with the same buffer and
        identification.

        The buffer is an immutable bytes object (or None), so sharing the
        reference is safe: update() rebinds it instead of mutating it.
        """
        new = FileDigester()
        new.buff = self.buff
        new.identification = self.identification
        return new