Source code for blocklib.pprlindex

import random
from typing import Any, Dict, List, Sequence, Optional, Union, cast
import logging
from pydantic.tools import parse_obj_as

from blocklib.configuration import get_config
from blocklib.utils import check_header
from blocklib.validation import PPRLIndexConfig


class PPRLIndex:
    """Base class for PPRL indexing/blocking.

    Holds the blocking configuration and provides helpers shared by the
    concrete blocking methods: mapping feature names to column indices and
    drawing random reference values. Derived classes must implement
    :meth:`build_reversed_index`.

    Note: ``self.blocking_features`` is read by
    :meth:`get_feature_to_index_map` but is not set by this base class; it is
    expected to be assigned by the derived class.
    """

    def __init__(self, config: PPRLIndexConfig) -> None:
        """Initialise base class.

        :param config: blocking configuration, stored as ``self.config``
        """
        # ``cast`` is a no-op at runtime; it only informs the type checker.
        self.config: PPRLIndexConfig = cast(PPRLIndexConfig, config)
        self.rec_dict = None
        self.ent_id_col = None
        self.rec_id_col: Optional[int] = None

    def get_feature_to_index_map(self, data: Sequence[Sequence],
                                 header: Optional[List[str]] = None) -> Optional[Dict[str, int]]:
        """Return feature name to feature index mapping if there is a header and feature is of type string.

        :param data: list of records; each record is either a tuple of
            attribute values or a single string (e.g. a CLK)
        :param header: file header, optional
        :return: dict mapping each header name to its position, or ``None``
            when no mapping applies (empty data, no header, string records,
            or blocking features that are not given as names)
        """
        # Check emptiness first so that an empty dataset never touches
        # ``self.blocking_features`` (which may be unset or empty).
        # ``len(...) == 0`` is used instead of truthiness so that sequence
        # types without a usable ``__bool__`` still work.
        if len(data) == 0:
            return None

        # Without a header there is nothing to map.
        if not header:
            return None

        # If data is CLKs, each record is a str rather than a tuple, and
        # there are no columns to index.
        if isinstance(data[0], str):
            return None

        # Only the first blocking feature is inspected to decide whether the
        # features are column names (str) or already column indices.
        if not isinstance(self.blocking_features[0], str):  # type: ignore
            return None

        check_header(header, data[0])
        return {name: ind for ind, name in enumerate(header)}

    def set_blocking_features_index(self, blocking_features,
                                    feature_to_index: Optional[Dict[str, int]] = None):
        """Set value of member variable blocking features index.

        self.blocking_features could be string (column name) or int (column index)
        self.blocking_features_index must be int (column index)

        :param blocking_features: column names or column indices
        :param feature_to_index: optional name-to-index mapping; when it is
            ``None`` or empty, ``blocking_features`` is stored unchanged
        :raises KeyError: if a mapping is given and a blocking feature is not
            one of its keys
        """
        if feature_to_index:
            self.blocking_features_index = [feature_to_index[x] for x in blocking_features]
        else:
            self.blocking_features_index = blocking_features

    def build_reversed_index(self, data: Sequence[Sequence], header: Optional[List[str]] = None):
        """Method which builds the index for all database.

        :param data: list of tuples, PII dataset
        :param header: file header, optional
        :rtype: ReversedIndexResult

        See derived classes for actual implementations.
        """
        raise NotImplementedError("Derived class needs to implement")

    @classmethod
    def select_reference_value(cls, reference_data: Sequence[Sequence], ref_data_config: Dict):
        """Load reference data for methods need reference.

        Reads ``'blocking-features'``, ``'random-state'`` and
        ``'num-reference-values'`` from ``ref_data_config`` via
        ``get_config``, concatenates the selected features of every reference
        record into one string, and draws a random sample of those strings.

        :param reference_data: list of reference records
        :param ref_data_config: reference data configuration
        :return: list of sampled reference values
        :raises ValueError: raised by ``random.sample`` if more values are
            requested than there are reference records
        """
        # read configurations
        ref_default_features = get_config(ref_data_config, 'blocking-features')
        ref_random_seed = get_config(ref_data_config, 'random-state')
        num_vals = get_config(ref_data_config, 'num-reference-values')

        # extract features in config
        rec_features = [''.join(dtuple[x] for x in ref_default_features)
                        for dtuple in reference_data]

        # generate reference values
        # NOTE: this reseeds the module-level ``random`` generator, which is
        # a process-wide side effect; kept so results stay reproducible for
        # existing callers.
        random.seed(ref_random_seed)
        ref_val_list = random.sample(rec_features, num_vals)

        # Lazy %-args: the message is only formatted if the record is emitted.
        logging.info('Selected %d random reference values', len(ref_val_list))
        return ref_val_list
class ReversedIndexResult(object):
    """Result of building a reversed index.

    Bundles the reversed index with the statistics computed for it. Two
    results compare equal when both their reversed indices and their
    statistics are equal. Because ``__eq__`` is defined without ``__hash__``,
    instances are unhashable.
    """

    def __init__(self, reversed_index: Dict, stats: Dict):
        """Store the reversed index and its statistics.

        :param reversed_index: the reversed index dictionary
        :param stats: statistics dictionary for the reversed index
        """
        self.reversed_index = reversed_index
        self.stats = stats

    def __eq__(self, other):
        """Compare with another ``ReversedIndexResult`` by value.

        Returns ``NotImplemented`` for any other type so that comparing with
        ``None`` or an unrelated object evaluates to ``False`` instead of
        raising ``AttributeError``.
        """
        if not isinstance(other, ReversedIndexResult):
            return NotImplemented
        return self.reversed_index == other.reversed_index and self.stats == other.stats