Source code for blocklib.encoding

"""Class to implement privacy preserving encoding."""
import hashlib
import numpy as np
from typing import List, Set


def flip_bloom_filter(string: str, bf_len: int, num_hash_funct: int):
    """
    Compute the bloom filter positions that a string flips to 1.

    The UTF-8 bytes of the string are hashed with SHA-1 and MD5; both digests
    are read as big-endian integers and reduced modulo ``bf_len``. The i-th
    position is then derived by double hashing:
    ``(sha1_value + i * md5_value) % bf_len``.

    :param string: str: value to hash into the bloom filter
    :param bf_len: int: length of the bloom filter
    :param num_hash_funct: int: number of hash functions (positions generated)
    :return: set of integer indices flipped to 1; it holds fewer than
             ``num_hash_funct`` entries when generated positions coincide
    """
    # Encode once and feed the same bytes to both hash functions.
    payload = string.encode('utf-8')
    sha_digest = hashlib.sha1(payload).digest()
    md5_digest = hashlib.md5(payload).digest()

    base = int.from_bytes(sha_digest, 'big') % bf_len
    step = int.from_bytes(md5_digest, 'big') % bf_len

    # One position per hash function; the set absorbs duplicates.
    return {(base + k * step) % bf_len for k in range(num_hash_funct)}
def generate_bloom_filter(list_of_strs: List[str], bf_len: int, num_hash_funct: int) -> np.ndarray:
    """
    Generate a bloom filter given a list of strings.

    Each string is hashed with ``flip_bloom_filter`` and the resulting index
    sets are merged; the merged indices are then materialised as a boolean
    vector.

    :param list_of_strs: strings (signatures) to encode into the bloom filter
    :param bf_len: length of the bloom filter
    :param num_hash_funct: number of hash functions applied to each string
    :return: numpy bool array of length ``bf_len`` that is True at every index
             flipped by at least one of the input strings
    """
    # Only the set of indices flipped to 1 is tracked while hashing.
    flipped_indices = set()  # type: Set[int]
    for signature in list_of_strs:
        # Merge in place: rebinding to ``.union(...)`` would copy the whole
        # accumulated set once per signature.
        flipped_indices.update(flip_bloom_filter(signature, bf_len, num_hash_funct))

    # Turn the index set into a numpy bool array.
    bloom_filter_vector = np.zeros(bf_len, dtype=bool)
    bloom_filter_vector[list(flipped_indices)] = True
    return bloom_filter_vector