Source code for blocklib.candidate_blocks_generator

import sys

from typing import Dict, Sequence, Tuple, Type, List, Optional, TextIO
from .pprlindex import PPRLIndex, ReversedIndexResult
from .pprlpsig import PPRLIndexPSignature
from .pprllambdafold import PPRLIndexLambdaFold
from .validation import validate_blocking_schema


# Registry of the supported blocking algorithms: maps the algorithm name to
# the PPRLIndex subclass implementing it. generate_candidate_blocks looks the
# schema's algorithm name up here and instantiates the class with the
# schema's config; names missing from this dict are rejected there.
PPRLSTATES = {
    "p-sig": PPRLIndexPSignature,
    "lambda-fold": PPRLIndexLambdaFold,
}  # type: Dict[str, Type[PPRLIndex]]


class CandidateBlockingResult:
    """Object for holding candidate blocking results.

    :ivar blocks: a dictionary that contains a mapping from the block ID to the record IDs in that block.
    :ivar state: A PPRLIndex state that contains the configuration of blocking
    :ivar stats: a dictionary containing the summary statistics of the generated blocks
    """

    def __init__(self, blocking_result: 'ReversedIndexResult', state: 'PPRLIndex'):
        """
        Initialise a blocking result object.

        :param blocking_result: A ReversedIndexResult object, containing the blocks and corresponding statistics
        :param state: A PPRLIndex state that contains configuration of blocking
        """
        self.blocks = blocking_result.reversed_index
        self.state = state
        self.stats = blocking_result.stats

    def print_summary_statistics(self, output: Optional[TextIO] = None, round_ndigits: int = 4):
        """
        Print the summary statistics of this candidate blocking result to 'output'.

        The overall statistics are written first. If the stats dictionary has a
        'statistics_per_strategy' entry, the statistics of every strategy in it
        are written afterwards, each under its own 'Strategy: <idx>' heading.

        :param output: a file like object to write to. Defaults to the current sys.stdout
        :param round_ndigits: round floating point numbers to ndigits precision. Defaults to 4.
        """
        if output is None:
            # Resolve stdout at call time. A `= sys.stdout` default would be
            # bound once at definition time and ignore later redirection
            # (contextlib.redirect_stdout, test capture, notebooks, ...).
            output = sys.stdout

        def print_stats(stats: Dict, out: TextIO):
            """Write one block of statistics; averages and deviations are rounded to round_ndigits."""
            out.write('\tNumber of Blocks:   {}\n'.format(stats['num_of_blocks']))
            out.write('\tMinimum Block Size: {}\n'.format(stats['min_size']))
            out.write('\tMaximum Block Size: {}\n'.format(stats['max_size']))
            out.write('\tAverage Block Size: {}\n'.format(round(stats['avg_size'], round_ndigits)))
            out.write('\tMedian Block Size:  {}\n'.format(stats['med_size']))
            out.write('\tStandard Deviation of Block Size:  {}\n'.format(round(stats['std_size'], round_ndigits)))
            # coverage is optional (per-strategy statistics may not carry it);
            # it is stored as a fraction and reported as a percentage.
            if 'coverage' in stats:
                out.write('\tCoverage:           {}%\n'.format(round(stats['coverage'] * 100, 2)))

        output.write('Statistics for the generated blocks:\n')
        print_stats(self.stats, output)
        if 'statistics_per_strategy' in self.stats:
            output.write('Individual statistics for each strategy:\n')
            for stat in self.stats['statistics_per_strategy']:
                output.write('Strategy: {}\n'.format(stat['strategy_idx']))
                print_stats(stat, output)
def generate_candidate_blocks(data: Sequence[Tuple[str, ...]],
                              blocking_schema: Dict,
                              header: Optional[List[str]] = None) -> CandidateBlockingResult:
    """
    Generate candidate blocks for 'data' according to 'blocking_schema'.

    :param data: list of tuples E.g. ('0', 'Kenneth Bain', '1964/06/17', 'M')
    :param blocking_schema: A description of how the signatures should be generated.
        See :ref:`blocking-schema`
    :param header: column names (optional)
        Required (and must be non-empty) if the blocking features are given as strings.
    :return: A CandidateBlockingResult holding the generated blocks, their
        summary statistics and the PPRLIndex state that produced them.
    :raises AssertionError: if the blocking features mix types, or if they are
        strings and no header is given.
    :raises NotImplementedError: if the algorithm named in the schema has no
        entry in PPRLSTATES.
    """
    blocking_model = validate_blocking_schema(blocking_schema)

    # extract algorithm and its config
    algorithm = blocking_model.type.value
    config = blocking_model.config

    # check if blocking features are column index or feature name
    blocking_features = config.blocking_features
    feature_type = type(blocking_features[0])

    # The checks below raise AssertionError explicitly instead of using
    # `assert` statements: asserts are stripped under `python -O`, which would
    # silently disable this validation. The exception type is kept so existing
    # callers see the same error.
    if any(type(x) is not feature_type for x in blocking_features[1:]):
        raise AssertionError('All feature types should be the same - either feature name or feature index')

    # header should not be None if blocking features are string
    if feature_type is str and not header:
        raise AssertionError('Header must not be None if blocking features are string')

    # Guard clause: reject algorithms without a registered implementation.
    if algorithm not in PPRLSTATES:
        raise NotImplementedError('The algorithm {} is not supported yet'.format(algorithm))

    state = PPRLSTATES[algorithm](config)
    reversed_index_result = state.build_reversed_index(data, header)
    return CandidateBlockingResult(reversed_index_result, state)