summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHelmut Grohne <helmut@subdivi.de>2017-09-23 10:33:43 +0200
committerHelmut Grohne <helmut@subdivi.de>2017-09-23 10:33:43 +0200
commit5df1185e5fa0830b546b4ef6af3cdadc655c16c8 (patch)
tree902c8fe30a19c6716ff4291a40b602d32c88b5f5
parenta9647ababd30925dc6c15f330a9158d94556cae5 (diff)
downloaddebian-dedup-5df1185e5fa0830b546b4ef6af3cdadc655c16c8.tar.gz
add module dedup.filemagic
This module is not used anywhere and thus its dependency on python3-magic is not recorded in the README. It can be used to guess the file type by looking at the contents using file magic. It is not a typical hash function, but it can be used for repurposing dedup for other analysers.
-rw-r--r--dedup/filemagic.py43
1 files changed, 43 insertions, 0 deletions
diff --git a/dedup/filemagic.py b/dedup/filemagic.py
new file mode 100644
index 0000000..4cc9357
--- /dev/null
+++ b/dedup/filemagic.py
@@ -0,0 +1,43 @@
+"""A very strange "hash" that uses the magic module (python3-magic) to guess
+the file type."""
+
+import magic
+
class FileDigester(object):
    """A hashlib-like class to guess a filetype using the magic module.

    Data is fed in with update() and buffered until FILE_BYTES_MAX bytes
    have been collected. At that point the identification is computed once,
    cached, and the buffer is released; any further data is ignored.
    """
    FILE_BYTES_MAX = 1024 * 1024 # copied from file source

    def __init__(self):
        # Bytes collected so far; becomes None once the identification has
        # been computed and cached by update().
        self.buff = b""
        # Cached identification string, or None while still buffering.
        self.identification = None

    def _compute_identification(self):
        """Ask the magic module to identify the currently buffered bytes.

        Returns the identification reported by magic.none_magic.buffer, or
        a fixed placeholder message when that call raises
        UnicodeDecodeError.
        """
        try:
            return magic.none_magic.buffer(self.buff)
        except UnicodeDecodeError:
            return "magic identification is not valid UTF-8"

    def update(self, buff):
        """Feed more data into the digester.

        Once an identification has been cached, additional data is silently
        dropped.
        """
        # Compare against None rather than relying on truthiness: a falsy
        # (e.g. empty) identification is still a finished one, and the
        # buffer is already None in that state, so appending would raise.
        if self.identification is not None:
            return
        self.buff += buff
        if len(self.buff) >= self.FILE_BYTES_MAX:
            self.identification = self._compute_identification()
            # Release the buffered data; it is not needed any more.
            self.buff = None

    def identify(self):
        """Return the guessed file magic identification.

        Uses the cached identification when update() has already computed
        one; otherwise identifies whatever has been buffered so far without
        caching the result, so more data may still be added afterwards.
        """
        if self.identification is not None:
            return self.identification
        return self._compute_identification()

    def hexdigest(self):
        """Compatibility with hashlib. An alias of identify. Doesn't return
        hex."""
        return self.identify()

    def copy(self):
        """Return a new FileDigester carrying the same buffered data and
        cached identification as this one."""
        new = FileDigester()
        # bytes objects are immutable, so sharing the reference is safe:
        # update() rebinds the attribute instead of mutating in place.
        new.buff = self.buff
        new.identification = self.identification
        return new