diff options
author | Helmut Grohne <helmut@subdivi.de> | 2017-09-23 10:33:43 +0200 |
---|---|---|
committer | Helmut Grohne <helmut@subdivi.de> | 2017-09-23 10:33:43 +0200 |
commit | 5df1185e5fa0830b546b4ef6af3cdadc655c16c8 (patch) | |
tree | 902c8fe30a19c6716ff4291a40b602d32c88b5f5 | |
parent | a9647ababd30925dc6c15f330a9158d94556cae5 (diff) | |
download | debian-dedup-5df1185e5fa0830b546b4ef6af3cdadc655c16c8.tar.gz |
add module dedup.filemagic
This module is not used anywhere and thus its dependency on
python3-magic is not recorded in the README. It can be used to guess the
file type by looking at the contents using file magic. It is not a
typical hash function, but it can be used for repurposing dedup for
other analysers.
-rw-r--r-- | dedup/filemagic.py | 43 |
1 files changed, 43 insertions, 0 deletions
"""A very strange "hash" that uses the magic module (python3-magic) to guess
the file type."""


class FileDigester(object):
    """A hashlib-like class to guess a filetype using the magic module.

    Data passed to update() is buffered until at least FILE_BYTES_MAX bytes
    have been collected. At that point the identification is computed once,
    stored in self.identification, and the buffer is dropped (set to None).
    Any further data is ignored.
    """
    FILE_BYTES_MAX = 1024 * 1024 # copied from file source

    def __init__(self):
        # Bytes collected so far; None once the identification is final.
        self.buff = b""
        # Final identification string, or None while still buffering.
        self.identification = None

    def _compute_identification(self):
        """Run file magic on the current buffer and return its description.

        Returns a fixed placeholder string when the magic result cannot be
        decoded as UTF-8.
        """
        # Imported here, at the only place it is used, so that the module
        # can be imported without python3-magic being installed.
        import magic
        try:
            return magic.none_magic.buffer(self.buff)
        except UnicodeDecodeError:
            return "magic identification is not valid UTF-8"

    def update(self, buff):
        """Feed more file content (bytes) into the digester.

        Input is ignored once the identification has been finalized.
        """
        # Compare against None rather than relying on truthiness: a falsy
        # (e.g. empty) identification is still final and self.buff is None
        # by then, so appending to it would raise a TypeError.
        if self.identification is not None:
            return
        self.buff += buff
        if len(self.buff) >= self.FILE_BYTES_MAX:
            self.identification = self._compute_identification()
            self.buff = None

    def identify(self):
        """Return the guessed file magic identification.

        If the identification has not been finalized yet, it is computed
        from the data buffered so far without being stored, so later
        update() calls can still refine the result.
        """
        if self.identification is not None:
            return self.identification
        return self._compute_identification()

    def hexdigest(self):
        """Compatibility with hashlib. An alias of identify. Doesn't return
        hex."""
        return self.identify()

    def copy(self):
        """Return a new FileDigester with the same buffered data and
        identification as this one."""
        new = FileDigester()
        # bytes objects are immutable and update() rebinds self.buff, so
        # sharing the reference does not couple the two digesters.
        new.buff = self.buff
        new.identification = self.identification
        return new