# Source code for blocklib.candidate_blocks_generator

import sys

from typing import Dict, Sequence, Tuple, Type, List, Optional, TextIO
from .pprlindex import PPRLIndex, ReversedIndexResult
from .pprlpsig import PPRLIndexPSignature
from .pprllambdafold import PPRLIndexLambdaFold
from .validation import validate_blocking_schema


# Registry of the supported blocking algorithms. The key is the algorithm name
# taken from the validated blocking schema (its ``type.value``); the value is
# the PPRLIndex subclass that is instantiated with the schema's config to
# build the reversed index.
PPRLSTATES = {
    "p-sig": PPRLIndexPSignature,
    "lambda-fold": PPRLIndexLambdaFold,
}  # type: Dict[str, Type[PPRLIndex]]


class CandidateBlockingResult:
    """Object for holding candidate blocking results.

    :ivar blocks: a dictionary that contains a mapping from the block ID to the record IDs in that block.
    :ivar state: A PPRLIndex state that contains the configuration of blocking
    :ivar stats: a dictionary containing the summary statistics of the generated blocks"""

    def __init__(self, blocking_result: 'ReversedIndexResult', state: 'PPRLIndex'):
        """
        Initialise a blocking result object.

        :param blocking_result: A ReversedIndexResult object, containing the blocks and corresponding statistics
        :param state: A PPRLIndex state that contains configuration of blocking
        """
        self.blocks = blocking_result.reversed_index
        self.state = state
        self.stats = blocking_result.stats

    def print_summary_statistics(self, output: Optional[TextIO] = None, round_ndigits: int = 4):
        """
        Print the summary statistics of this candidate blocking result to 'output'.

        The overall statistics are written first. If the stats contain a
        'statistics_per_strategy' entry, the statistics of every strategy are
        written afterwards, each preceded by its strategy index.

        :param output: a file like object to write to. Defaults to the current sys.stdout
        :param round_ndigits: round floating point numbers to ndigits precision. Defaults to 4.
        """
        # Resolve the default at call time rather than definition time, so that
        # a redirected or replaced sys.stdout is honoured.
        if output is None:
            output = sys.stdout

        def print_stats(stats: Dict, out: TextIO):
            """Write one block of statistics; 'coverage' is optional."""
            out.write('\tNumber of Blocks:   {}\n'.format(stats['num_of_blocks']))
            out.write('\tMinimum Block Size: {}\n'.format(stats['min_size']))
            out.write('\tMaximum Block Size: {}\n'.format(stats['max_size']))
            out.write('\tAverage Block Size: {}\n'.format(round(stats['avg_size'], round_ndigits)))
            out.write('\tMedian Block Size:  {}\n'.format(stats['med_size']))
            out.write('\tStandard Deviation of Block Size:  {}\n'.format(round(stats['std_size'], round_ndigits)))
            if 'coverage' in stats:
                # coverage is stored as a fraction; report it as a percentage
                out.write('\tCoverage:           {}%\n'.format(round(stats['coverage'] * 100, 2)))

        output.write('Statistics for the generated blocks:\n')
        print_stats(self.stats, output)
        if 'statistics_per_strategy' in self.stats:
            output.write('Individual statistics for each strategy:\n')
            for stat in self.stats['statistics_per_strategy']:
                output.write('Strategy: {}\n'.format(stat['strategy_idx']))
                print_stats(stat, output)


def generate_candidate_blocks(data: Sequence[Tuple[str, ...]],
                              blocking_schema: Dict,
                              header: Optional[List[str]] = None) -> CandidateBlockingResult:
    """
    Generate candidate blocks for ``data`` as described by ``blocking_schema``.

    :param data: list of tuples E.g. ('0', 'Kenneth Bain', '1964/06/17', 'M')
    :param blocking_schema:
        A description of how the signatures should be generated.
        See :ref:`blocking-schema`
    :param header: column names (optional).
        Required when the blocking features are given as feature names (strings).

    :return: A CandidateBlockingResult built from the reversed index result
        and the blocking state object that produced it.
    :raises AssertionError: if the blocking features mix names and indices, or
        if the blocking features are names but no header was provided.
    :raises NotImplementedError: if the schema's algorithm is not in PPRLSTATES.
    """
    blocking_model = validate_blocking_schema(blocking_schema)

    # extract algorithm and its config
    algorithm = blocking_model.type.value
    config = blocking_model.config

    # Blocking features must be uniformly column indices or feature names.
    # AssertionError is raised explicitly (rather than via ``assert``) so the
    # checks still run under ``python -O`` while keeping the exception type.
    blocking_features = config.blocking_features
    feature_type = type(blocking_features[0])
    if any(type(feature) is not feature_type for feature in blocking_features[1:]):
        raise AssertionError('All feature types should be the same - either feature name or feature index')

    # feature names can only be resolved with a (non-empty) header
    if feature_type is str and not header:
        raise AssertionError('Header must not be None if blocking features are string')

    if algorithm not in PPRLSTATES:
        raise NotImplementedError('The algorithm {} is not supported yet'.format(algorithm))

    state = PPRLSTATES[algorithm](config)
    reversed_index_result = state.build_reversed_index(data, header)
    return CandidateBlockingResult(reversed_index_result, state)