Source code for idtxl.bivariate_pid

"""Estimate partial information decomposition (PID).

Estimate PID for two source processes and one target process using different
estimators.

Note:
    Written for Python 3.4+
"""
import numpy as np
from .single_process_analysis import SingleProcessAnalysis
from .estimator import get_estimator
from .results import ResultsPID


class BivariatePID(SingleProcessAnalysis):
    """Perform partial information decomposition for individual processes.

    Perform partial information decomposition (PID) for two source processes
    and one target process in the network. Estimate unique, shared, and
    synergistic information in the two sources about the target. Call
    analyse_network() on the whole network or a set of nodes or call
    analyse_single_target() to estimate PID for a single process. See
    docstrings of the two functions for more information.

    References:

    - Williams, P. L., & Beer, R. D. (2010). Nonnegative Decomposition of
      Multivariate Information. arXiv:1004.2515.
      http://arxiv.org/abs/1004.2515
    - Bertschinger, N., Rauh, J., Olbrich, E., Jost, J., & Ay, N. (2014).
      Quantifying Unique Information. Entropy, 16(4), 2161–2183.
      http://doi.org/10.3390/e16042161

    Attributes:
        target : int
            index of target process
        sources : array type
            pair of indices of source processes
        settings : dict
            analysis settings
        results : dict
            estimated PID
    """

    def __init__(self):
        self._pid_estimator = None
        self.sources = None
        self.target = None
        self.results = None
        super().__init__()
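    # Illustrative sketch (not part of the class API): the XOR system used in
    # the docstring examples below is the canonical motivation for PID. Each
    # source alone carries no information about the target, but both sources
    # together determine it fully, so the decomposition must attribute the
    # joint information to synergy. A quick plug-in check with numpy (the
    # helper name `h` is hypothetical, used only for this sketch):
    #
    #     import numpy as np
    #
    #     def h(*v):
    #         # Plug-in joint entropy in bits of discrete variables.
    #         _, c = np.unique(np.stack(v, axis=1), axis=0, return_counts=True)
    #         p = c / c.sum()
    #         return -np.sum(p * np.log2(p))
    #
    #     rng = np.random.default_rng(0)
    #     x = rng.integers(0, 2, 10000)
    #     y = rng.integers(0, 2, 10000)
    #     z = np.logical_xor(x, y).astype(int)
    #
    #     h(x) + h(z) - h(x, z)        # I(X;Z)   ~ 0 bit
    #     h(y) + h(z) - h(y, z)        # I(Y;Z)   ~ 0 bit
    #     h(x, y) + h(z) - h(x, y, z)  # I(X,Y;Z) ~ 1 bit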
    def analyse_network(self, settings, data, targets, sources):
        """Estimate partial information decomposition for network nodes.

        Estimate partial information decomposition (PID) for multiple nodes
        in the network.

        Note:
            For a detailed description of the algorithm and settings see
            documentation of the analyse_single_target() method and
            references in the class docstring.

        Example:

            >>> n = 20
            >>> alph = 2
            >>> x = np.random.randint(0, alph, n)
            >>> y = np.random.randint(0, alph, n)
            >>> z = np.logical_xor(x, y).astype(int)
            >>> data = Data(np.vstack((x, y, z)), 'ps', normalise=False)
            >>> settings = {
            >>>     'lags_pid': [[1, 1], [3, 2], [0, 0]],
            >>>     'alpha': 0.1,
            >>>     'alph_s1': alph,
            >>>     'alph_s2': alph,
            >>>     'alph_t': alph,
            >>>     'max_unsuc_swaps_row_parm': 60,
            >>>     'num_reps': 63,
            >>>     'max_iters': 1000,
            >>>     'pid_estimator': 'SydneyPID'}
            >>> targets = [0, 1, 2]
            >>> sources = [[1, 2], [0, 2], [0, 1]]
            >>> pid_analysis = BivariatePID()
            >>> results = pid_analysis.analyse_network(settings, data, targets,
            >>>                                        sources)

        Args:
            settings : dict
                parameters for estimation and statistical testing, see
                documentation of analyse_single_target() for details, can
                contain

                - lags_pid : list of lists of ints [optional] - lags in
                  samples between sources and target
                  (default=[[1, 1], [1, 1], ...])

            data : Data instance
                raw data for analysis
            targets : list of int
                indices of target processes
            sources : list of lists
                indices of the two source processes for each target, e.g.,
                [[0, 2], [1, 0]], must have the same length as targets

        Returns:
            ResultsPID instance
                results of network inference, see documentation of
                ResultsPID()
        """
        # Set defaults for PID estimation.
        settings.setdefault("verbose", True)
        settings.setdefault("lags_pid", np.array([[1, 1]] * len(targets)))

        # Check inputs.
        if not len(targets) == len(sources) == len(settings["lags_pid"]):
            raise RuntimeError(
                "Lists of targets, sources, and lags must have "
                "the same lengths."
            )
        list_of_lags = settings["lags_pid"]

        # Perform PID estimation for each target individually.
        results = ResultsPID(
            n_nodes=data.n_processes,
            n_realisations=data.n_realisations(),
            normalised=data.normalise,
        )
        for t, target in enumerate(targets):
            if settings["verbose"]:
                print(f"\n####### analysing target with index {t} from list {targets}")
            settings["lags_pid"] = list_of_lags[t]
            res_single = self.analyse_single_target(settings, data, target, sources[t])
            results.combine_results(res_single)
        # Get no. realisations actually used for estimation from single-target
        # analysis.
        results.data_properties.n_realisations = (
            res_single.data_properties.n_realisations
        )
        return results
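    # Illustrative sketch: reading back per-target estimates from the
    # combined ResultsPID object returned by analyse_network(). The result
    # keys follow the usage in _calculate_pid() below; the accessor is
    # assumed from the ResultsPID interface, see its documentation for the
    # authoritative API:
    #
    #     results = pid_analysis.analyse_network(settings, data, targets,
    #                                            sources)
    #     res_t0 = results.get_single_target(0)
    #     res_t0['unq_s1']      # unique information in source 1
    #     res_t0['unq_s2']      # unique information in source 2
    #     res_t0['shd_s1_s2']   # shared (redundant) information
    #     res_t0['syn_s1_s2']   # synergistic information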
    def analyse_single_target(self, settings, data, target, sources):
        """Estimate partial information decomposition for a network node.

        Estimate partial information decomposition (PID) for a target node
        in the network.

        Note:
            For a description of the algorithm and the method see references
            in the class and estimator docstrings.

        Example:

            >>> n = 20
            >>> alph = 2
            >>> x = np.random.randint(0, alph, n)
            >>> y = np.random.randint(0, alph, n)
            >>> z = np.logical_xor(x, y).astype(int)
            >>> data = Data(np.vstack((x, y, z)), 'ps', normalise=False)
            >>> settings = {
            >>>     'alpha': 0.1,
            >>>     'alph_s1': alph,
            >>>     'alph_s2': alph,
            >>>     'alph_t': alph,
            >>>     'max_unsuc_swaps_row_parm': 60,
            >>>     'num_reps': 63,
            >>>     'max_iters': 1000,
            >>>     'pid_estimator': 'SydneyPID',
            >>>     'lags_pid': [2, 3]}
            >>> pid_analysis = BivariatePID()
            >>> results = pid_analysis.analyse_single_target(settings=settings,
            >>>                                              data=data,
            >>>                                              target=0,
            >>>                                              sources=[1, 2])

        Args:
            settings : dict
                parameters for estimator use and statistics:

                - pid_estimator : str - estimator to be used for PID
                  estimation (for estimator settings see the documentation in
                  the estimators_pid modules)
                - lags_pid : list of ints [optional] - lags in samples
                  between sources and target (default=[1, 1])
                - verbose : bool [optional] - toggle console output
                  (default=True)

            data : Data instance
                raw data for analysis
            target : int
                index of target process
            sources : list of ints
                indices of the two source processes for the target

        Returns:
            ResultsPID instance
                results of network inference, see documentation of
                ResultsPID()
        """
        # Check input and initialise values for analysis.
        self._initialise(settings, data, target, sources)

        # Estimate PID and significance.
        self._calculate_pid(data)

        # Add analysis info.
        results = ResultsPID(
            n_nodes=data.n_processes,
            n_realisations=data.n_realisations(self.current_value),
            normalised=data.normalise,
        )
        results._add_single_result(
            settings=self.settings, target=self.target, results=self.results
        )
        self._reset()
        return results
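    # Illustrative sketch: 'lags_pid' gives the offset, in samples, between
    # each source and the target. With lags_pid=[2, 3], source 1 enters the
    # estimation two samples and source 2 three samples before the current
    # target sample; internally the current value is set to
    # (target, max(lags_pid)), see _initialise() below. Assuming the
    # dict-like result access shown above:
    #
    #     settings['lags_pid'] = [2, 3]
    #     results = pid_analysis.analyse_single_target(
    #         settings=settings, data=data, target=0, sources=[1, 2])
    #     results.get_single_target(0)['current_value']   # (0, 3)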
    def _initialise(self, settings, data, target, sources):
        """Check input, set initial or default values for analysis settings."""
        # Check requested PID estimator.
        assert "pid_estimator" in settings, "Estimator was not specified!"
        self._pid_estimator = get_estimator(settings["pid_estimator"], settings)

        self.settings = settings.copy()
        self.settings.setdefault("lags_pid", [1, 1])
        self.settings.setdefault("verbose", True)

        # Check if provided lags are correct and work with the number of
        # samples in the data.
        if len(self.settings["lags_pid"]) != 2:
            raise RuntimeError("List of lags must have length 2.")
        if self.settings["lags_pid"][0] >= data.n_samples:
            raise RuntimeError(
                f"Lag 1 ({self.settings['lags_pid'][0]}) is larger than the number of "
                f"samples in the data set ({data.n_samples})."
            )
        if self.settings["lags_pid"][1] >= data.n_samples:
            raise RuntimeError(
                f"Lag 2 ({self.settings['lags_pid'][1]}) is larger than the number of "
                f"samples in the data set ({data.n_samples})."
            )

        # Check if target and sources are provided correctly.
        if not isinstance(target, int):
            raise RuntimeError("Target must be an integer.")
        if len(sources) != 2:
            raise RuntimeError("List of sources must have length 2.")
        if target in sources:
            raise RuntimeError(
                f"The target ({target}) should not be in the list "
                f"of sources ({sources})."
            )

        self.current_value = (target, max(self.settings["lags_pid"]))
        self.target = target
        # TODO works for single vars only, change to multivariate?
        self.sources = self._lag_to_idx(
            [
                (sources[0], self.settings["lags_pid"][0]),
                (sources[1], self.settings["lags_pid"][1]),
            ]
        )

    def _calculate_pid(self, data):
        # TODO Discuss how and if the following statistical testing should
        # be included. Remove dummy results.
        # [orig_pid, sign_1, p_val_1,
        #  sign_2, p_val_2] = stats.unq_against_surrogates(self, data)
        # [orig_pid, sign_shd,
        #  p_val_shd, sign_syn, p_val_syn] = stats.syn_shd_against_surrogates(
        #      self, data)
        # sign_1 = sign_2 = sign_shd = sign_syn = False
        # p_val_1 = p_val_2 = p_val_shd = p_val_syn = 1.0

        target_realisations = data.get_realisations(
            self.current_value, [self.current_value]
        )[0]
        source_1_realisations = data.get_realisations(
            self.current_value, [self.sources[0]]
        )[0]
        source_2_realisations = data.get_realisations(
            self.current_value, [self.sources[1]]
        )[0]
        orig_pid = self._pid_estimator.estimate(
            s1=source_1_realisations, s2=source_2_realisations, t=target_realisations
        )

        if self.settings["verbose"]:
            print(
                f"\nunq information s1: {orig_pid['unq_s1']:.8f}, "
                f"s2: {orig_pid['unq_s2']:.8f}"
            )
            print(
                f"shd information: {orig_pid['shd_s1_s2']:.8f}, "
                f"syn information: {orig_pid['syn_s1_s2']:.8f}"
            )
        self.results = orig_pid
        self.results["source_1"] = self._idx_to_lag([self.sources[0]])
        self.results["source_2"] = self._idx_to_lag([self.sources[1]])
        self.results["selected_vars_sources"] = [
            self.results["source_1"][0],
            self.results["source_2"][0],
        ]
        self.results["current_value"] = self.current_value
        # self.results['unq_s1_sign'] = sign_1
        # self.results['unq_s2_sign'] = sign_2
        # self.results['unq_s1_p_val'] = p_val_1
        # self.results['unq_s2_p_val'] = p_val_2
        # self.results['syn_sign'] = sign_syn
        # self.results['syn_p_val'] = p_val_syn
        # self.results['shd_sign'] = sign_shd
        # self.results['shd_p_val'] = p_val_shd

        # TODO make mi_against_surrogates in stats more generic, such that
        # it becomes an arbitrary permutation test where one argument gets
        # shuffled and then all arguments are passed to the provided
        # estimator.

    def _reset(self):
        """Reset instance after analysis."""
        self.__init__()
        del self.results
        del self.settings
        del self._pid_estimator
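# Illustrative sketch (hypothetical, not part of the module): estimators
# returned by get_estimator() are expected to expose an estimate() method
# with the keyword signature used in _calculate_pid() above and to return a
# dict containing at least the keys read there. A minimal stand-in, e.g. for
# wiring tests, could look like this:
#
#     class DummyPIDEstimator:
#         """Return zeros for all PID atoms (contract check only)."""
#
#         def __init__(self, settings):
#             self.settings = settings
#
#         def estimate(self, s1, s2, t):
#             return {'unq_s1': 0.0, 'unq_s2': 0.0,
#                     'shd_s1_s2': 0.0, 'syn_s1_s2': 0.0}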