summaryrefslogtreecommitdiff
path: root/dedup/filemagic.py
diff options
context:
space:
mode:
authorHelmut Grohne <helmut@subdivi.de>2021-12-31 15:45:33 +0100
committerHelmut Grohne <helmut@subdivi.de>2021-12-31 15:45:33 +0100
commitf3ea68482e6c01053cb202573d953e8a2e89529f (patch)
tree4c08f6e5a99bbe5131c0949e7f97cc44cf4a2cbd /dedup/filemagic.py
parentf2eda3ba74e5bc5613e84381ebd8bfd343e1c8cc (diff)
parent5b359b10053cbade539246eec26e86b44793ca40 (diff)
downloaddebian-dedup-f3ea68482e6c01053cb202573d953e8a2e89529f.tar.gz
Merge branch master into branch multiarchhints
Among other things, this drops Python 2.x support.
Diffstat (limited to 'dedup/filemagic.py')
-rw-r--r--dedup/filemagic.py49
1 files changed, 49 insertions, 0 deletions
diff --git a/dedup/filemagic.py b/dedup/filemagic.py
new file mode 100644
index 0000000..b71c276
--- /dev/null
+++ b/dedup/filemagic.py
@@ -0,0 +1,49 @@
+"""A very strange "hash" that uses the magic module (python3-magic) to guess
+the file type."""
+
+import magic
+
# The magic module changed its API a few times: use the module-level
# from_buffer callable when it exists, otherwise fall back to the buffer
# method of the module-level none_magic object.
if hasattr(magic, "from_buffer"):
    _magic_identify = magic.from_buffer
else:
    _magic_identify = magic.none_magic.buffer
+
class FileDigester:
    """A hashlib-like class to guess a filetype using the magic module.

    Data passed to update() is accumulated in ``buff``. Once at least
    FILE_BYTES_MAX bytes have been collected, the identification is computed
    a single time, stored in ``identification`` and the buffer is dropped;
    any further data is ignored.
    """
    FILE_BYTES_MAX = 1024 * 1024  # copied from file source

    def __init__(self):
        # Bytes collected so far; set to None once the result is cached.
        self.buff = b""
        # Cached identification; None means "not computed yet".
        self.identification = None

    def _compute_identification(self):
        """Identify the currently buffered bytes.

        Returns the result of the magic lookup, or a fixed placeholder
        string when the lookup raises UnicodeDecodeError.
        """
        try:
            return _magic_identify(self.buff)
        except UnicodeDecodeError:
            return "magic identification is not valid UTF-8"

    def update(self, buff):
        """Feed more bytes into the digester.

        Does nothing once an identification has been cached.
        """
        # Compare against None instead of relying on truthiness: a falsy
        # result (e.g. an empty string) is still a cached result, and
        # self.buff is already None at that point, so appending would fail.
        if self.identification is not None:
            return
        self.buff += buff
        if len(self.buff) >= self.FILE_BYTES_MAX:
            self.identification = self._compute_identification()
            # The buffered data is no longer needed; release it.
            self.buff = None

    def identify(self):
        """Return the guessed file magic identification."""
        if self.identification is not None:
            return self.identification
        # Not enough data to have cached a result yet: identify what has
        # been buffered so far without caching, as more data may follow.
        return self._compute_identification()

    def hexdigest(self):
        """Compatibility with hashlib. An alias of identify. Doesn't return
        hex."""
        return self.identify()

    def copy(self):
        """Return a new FileDigester carrying the same buffer and cached
        identification as this one."""
        new = FileDigester()
        # bytes objects are immutable and update() rebinds self.buff, so
        # sharing the reference does not couple the two digesters.
        new.buff = self.buff
        new.identification = self.identification
        return new