summaryrefslogtreecommitdiff
path: root/dedup/filemagic.py
diff options
context:
space:
mode:
Diffstat (limited to 'dedup/filemagic.py')
-rw-r--r--dedup/filemagic.py49
1 files changed, 49 insertions, 0 deletions
diff --git a/dedup/filemagic.py b/dedup/filemagic.py
new file mode 100644
index 0000000..b71c276
--- /dev/null
+++ b/dedup/filemagic.py
@@ -0,0 +1,49 @@
+"""A very strange "hash" that uses the magic module (python3-magic) to guess
+the file type."""
+
+import magic
+
# The magic module changed its API a few times: prefer the module-level
# from_buffer function and fall back to none_magic.buffer when the installed
# module does not provide it.
_magic_identify = getattr(magic, "from_buffer", None)
if _magic_identify is None:
    _magic_identify = magic.none_magic.buffer
+
class FileDigester:
    """A hashlib-like class to guess a filetype using the magic module.

    Feed data with update() and read the guess with identify() (or its
    hashlib-compatible alias hexdigest()).  At most FILE_BYTES_MAX leading
    bytes are handed to the magic module; once that many bytes have been
    collected the identification is computed, stored, and all further data is
    ignored.
    """
    FILE_BYTES_MAX = 1024 * 1024  # copied from file source

    def __init__(self):
        # Data collected so far; replaced by None once the identification
        # has been finalized and the data is no longer needed.
        self.buff = b""
        # The final identification, or None while data is still collected.
        self.identification = None

    def _compute_identification(self):
        """Run the magic identification on the collected data.

        Only the first FILE_BYTES_MAX bytes are considered, so the result
        does not depend on how the input was split into update() calls.
        A UnicodeDecodeError raised by the magic call is turned into a fixed
        placeholder string.
        """
        try:
            return _magic_identify(self.buff[:self.FILE_BYTES_MAX])
        except UnicodeDecodeError:
            return "magic identification is not valid UTF-8"

    def update(self, buff):
        """Append the bytes in buff to the data to be identified.

        Does nothing once the identification has been finalized.
        """
        # Compare against None rather than testing truthiness: a final but
        # falsy identification (e.g. an empty string) must not lead to
        # appending to self.buff, which is None at that point.
        if self.identification is not None:
            return
        self.buff += buff
        if len(self.buff) >= self.FILE_BYTES_MAX:
            self.identification = self._compute_identification()
            # Release the collected data; it is not needed anymore.
            self.buff = None

    def identify(self):
        """Return the guessed file magic identification.

        If the identification is not final yet, it is computed from the data
        collected so far without being stored, so more data can still be
        added afterwards.
        """
        if self.identification is not None:
            return self.identification
        return self._compute_identification()

    def hexdigest(self):
        """Compatibility with hashlib. An alias of identify. Doesn't return
        hex."""
        return self.identify()

    def copy(self):
        """Return a new FileDigester carrying the same state as this one."""
        new = FileDigester()
        # Sharing the buffer is safe: bytes objects are immutable and
        # update() rebinds self.buff instead of modifying it in place.
        new.buff = self.buff
        new.identification = self.identification
        return new