summaryrefslogtreecommitdiff
path: root/dedup/hashing.py
blob: 0c786e1505071755c297dc93c86b9fc601f9a372 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import io
import os

class HashBlacklist(object):
    """Wrap a hashlib-like object so that blacklisted digests read as None.

    Only hexdigests are supported here, so there is no digest() method. The
    update and copy methods and the name attribute are forwarded to the
    wrapped object.
    """

    def __init__(self, hashobj, blacklist=()):
        """
        @param hashobj: a hashlib-like object
        @param blacklist: an object providing __contains__. Every hexdigest
            value contained in it is reported as None by hexdigest.
        """
        self.hashobj = hashobj
        self.blacklist = blacklist
        # Expose the wrapped object's bound update method directly.
        self.update = hashobj.update

    def copy(self):
        """Return a new HashBlacklist around a copy of the wrapped hash.

        The blacklist object itself is shared with the copy.
        """
        duplicate = self.hashobj.copy()
        return HashBlacklist(duplicate, self.blacklist)

    def hexdigest(self):
        """Return the wrapped hexdigest, or None if it is blacklisted."""
        value = self.hashobj.hexdigest()
        return None if value in self.blacklist else value

    @property
    def name(self):
        """The name attribute of the wrapped hash object."""
        return self.hashobj.name

class DecompressedHash(object):
    """Hash the decompressed form of whatever data is fed in. The hashlib
    interface is provided in part: update, hexdigest and copy are available,
    digest and name are not."""

    def __init__(self, decompressor, hashobj):
        """
        @param decompressor: a decompression object like bz2.BZ2Decompressor or
            lzma.LZMADecompressor. Its decompress method is used by update and
            its copy method by copy. If it has a flush method, hexdigest uses
            both copy and flush. An unused_data attribute is part of the
            expected interface, although this class does not read it.
        @param hashobj: a hashlib-like obj providing methods update, hexdigest
            and copy
        """
        self.decompressor = decompressor
        self.hashobj = hashobj

    def update(self, data):
        """Decompress data and feed the result to the hash object."""
        plain = self.decompressor.decompress(data)
        self.hashobj.update(plain)

    def hexdigest(self):
        """Return the hexdigest of the data decompressed so far.

        If the decompressor can be flushed, flushing and the final hash
        update happen on copies, so this object's own decompressor and hash
        object are left untouched.
        """
        if hasattr(self.decompressor, "flush"):
            trailing = self.decompressor.copy().flush()
            finished = self.hashobj.copy()
            finished.update(trailing)
            return finished.hexdigest()
        return self.hashobj.hexdigest()

    def copy(self):
        """Return a DecompressedHash built from copies of both members."""
        decomp_clone = self.decompressor.copy()
        hash_clone = self.hashobj.copy()
        return DecompressedHash(decomp_clone, hash_clone)

class SuppressingHash(object):
    """A hash that silences exceptions from the update and hexdigest methods of
    a hashlib-like object. If an exception has occurred, hexdigest always
    returns None."""

    def __init__(self, hashobj, exceptions=()):
        """
        @param hashobj: a hashlib-like object providing methods update, copy
            and hexdigest. If a name attribute is present, it is mirrored as
            well. None is accepted and yields a hash that is already in the
            failed state.
        @type exceptions: tuple
        @param exceptions: exception classes to be suppressed
        """
        self.hashobj = hashobj
        self.exceptions = exceptions
        if hasattr(hashobj, "name"):
            self.name = hashobj.name

    def update(self, data):
        """Feed data to the wrapped hash unless it has already failed.

        A suppressed exception puts this object into the failed state: the
        wrapped hash is dropped and later updates are ignored.
        """
        # Compare against None rather than testing truthiness, so that a
        # wrapped object which happens to be falsy is not mistaken for a
        # failed one.
        if self.hashobj is None:
            return
        try:
            self.hashobj.update(data)
        except self.exceptions:
            self.hashobj = None

    def hexdigest(self):
        """Return the wrapped hexdigest, or None if this hash has failed or a
        suppressed exception is raised while computing it."""
        if self.hashobj is None:
            return None
        try:
            return self.hashobj.hexdigest()
        except self.exceptions:
            self.hashobj = None
        return None

    def copy(self):
        """Return an independent SuppressingHash in the same state.

        Exceptions raised by the wrapped copy method are not suppressed.
        """
        inner = None if self.hashobj is None else self.hashobj.copy()
        duplicate = SuppressingHash(inner, self.exceptions)
        # A failed hash no longer holds an object to mirror the name from,
        # so carry the name over explicitly instead of losing it.
        if not hasattr(duplicate, "name") and hasattr(self, "name"):
            duplicate.name = self.name
        return duplicate

class StoredHash(object):
    """A hash that merely buffers the data passed to update and defers the
    actual computation to the hexdigest call."""

    def __init__(self, digestfunc, acceptfunc=lambda _: True,
                 sizelimit=1024*1024*16):
        """
        @type digestfunc: file-like -> str
        @param digestfunc: should read the given file-like and return
            the computed hash. The file-like can be assumed to be seekable.
        @type acceptfunc: file-like -> bool or None
        @param acceptfunc: should read enough of file-like to determine
            whether the hash is computable. To accept the hash, return True.
            To reject producing a hash return False. To defer the decision
            until more data is available return None.
        @type sizelimit: int
        @param sizelimit: when the content exceeds this size, reject it
        """
        self.digestfunc = digestfunc
        self.acceptfunc = acceptfunc
        self.sizelimit = sizelimit
        # Buffer for all data seen so far; None once the content is rejected.
        self.content = io.BytesIO()
        # Becomes True once acceptfunc has returned a true value.
        self.accepted = False

    def update(self, data):
        """Append data to the buffer and, while still undecided, ask
        acceptfunc about the buffered content. Empty data and updates after
        a rejection are ignored."""
        buf = self.content
        if buf is None or not data:
            return
        buf.seek(0, os.SEEK_END)
        if buf.tell() + len(data) > self.sizelimit:
            # Too large: drop everything and reject for good.
            self.content = None
            return
        buf.write(data)
        if self.accepted:
            return
        # acceptfunc gets to read the buffer from its beginning.
        buf.seek(0, os.SEEK_SET)
        verdict = self.acceptfunc(buf)
        if verdict is None:
            return
        if verdict:
            self.accepted = True
        else:
            self.content = None

    def hexdigest(self):
        """Return digestfunc applied to the rewound buffer, or None if the
        content was rejected or has not been accepted."""
        if self.content and self.accepted:
            self.content.seek(0, os.SEEK_SET)
            return self.digestfunc(self.content)
        return None

    def copy(self):
        """Return a StoredHash with the same functions, size limit and
        acceptance state and a separate buffer holding the same bytes."""
        clone = StoredHash(self.digestfunc, self.acceptfunc, self.sizelimit)
        clone.accepted = self.accepted
        if self.content:
            clone.content = io.BytesIO(self.content.getvalue())
        else:
            clone.content = None
        return clone

def hash_file(hashobj, filelike, blocksize=65536):
    """Read the given filelike until it is exhausted and pass every block that
    was read to the update method of the given hashobj.
    @param hashobj: hashlib-like object providing an update method
    @param filelike: file-like object providing read(size)
    @param blocksize: the size passed to each read call
    @return: the hashobj that was passed in
    """
    while True:
        chunk = filelike.read(blocksize)
        if not chunk:
            # An empty read signals the end of the input.
            return hashobj
        hashobj.update(chunk)