"""Estimate partial information decomposition (PID).
Estimate PID for two source and one target process using different estimators.
Note:
Written for Python 3.4+
"""
import numpy as np
from .single_process_analysis import SingleProcessAnalysis
from .estimator import get_estimator
from .results import ResultsPID
class BivariatePID(SingleProcessAnalysis):
"""Perform partial information decomposition for individual processes.
Perform partial information decomposition (PID) for two source processes
and one target process in the network. Estimate unique, shared, and
synergistic information in the two sources about the target. Call
analyse_network() on the whole network or a set of nodes or call
analyse_single_target() to estimate PID for a single process. See
docstrings of the two functions for more information.
References:
- Williams, P. L., & Beer, R. D. (2010). Nonnegative Decomposition of
Multivariate Information, 1–14. Retrieved from
http://arxiv.org/abs/1004.2515
- Bertschinger, N., Rauh, J., Olbrich, E., Jost, J., & Ay, N. (2014).
Quantifying Unique Information. Entropy, 16(4), 2161–2183.
http://doi.org/10.3390/e16042161
Attributes:
target : int
index of target process
sources : array type
pair of indices of source processes
settings : dict
analysis settings
results : dict
estimated PID
"""
def __init__(self):
self._pid_estimator = None
self.sources = None
self.target = None
self.results = None
super().__init__()
    def analyse_network(self, settings, data, targets, sources):
"""Estimate partial information decomposition for network nodes.
Estimate partial information decomposition (PID) for multiple nodes in
the network.
Note:
For a detailed description of the algorithm and settings see
documentation of the analyse_single_target() method and
references in the class docstring.
Example:
>>> n = 20
>>> alph = 2
>>> x = np.random.randint(0, alph, n)
>>> y = np.random.randint(0, alph, n)
>>> z = np.logical_xor(x, y).astype(int)
>>> data = Data(np.vstack((x, y, z)), 'ps', normalise=False)
>>> settings = {
>>> 'lags_pid': [[1, 1], [3, 2], [0, 0]],
>>> 'alpha': 0.1,
>>> 'alph_s1': alph,
>>> 'alph_s2': alph,
>>> 'alph_t': alph,
>>> 'max_unsuc_swaps_row_parm': 60,
>>> 'num_reps': 63,
>>> 'max_iters': 1000,
>>> 'pid_estimator': 'SydneyPID'}
>>> targets = [0, 1, 2]
>>> sources = [[1, 2], [0, 2], [0, 1]]
>>> pid_analysis = BivariatePID()
>>> results = pid_analysis.analyse_network(settings, data, targets,
>>> sources)
Args:
settings : dict
parameters for estimation and statistical testing, see
documentation of analyse_single_target() for details, can
contain
- lags_pid : list of lists of ints [optional] - lags in samples
between sources and target (default=[[1, 1], [1, 1] ...])
data : Data instance
raw data for analysis
targets : list of int
                indices of target processes
sources : list of lists
indices of the two source processes for each target, e.g.,
[[0, 2], [1, 0]], must have the same length as targets
Returns:
ResultsPID instance
results of network inference, see documentation of
ResultsPID()
"""
# Set defaults for PID estimation.
settings.setdefault("verbose", True)
settings.setdefault("lags_pid", np.array([[1, 1]] * len(targets)))
# Check inputs.
if not len(targets) == len(sources) == len(settings["lags_pid"]):
raise RuntimeError(
"Lists of targets, sources, and lags must have" "the same lengths."
)
list_of_lags = settings["lags_pid"]
# Perform PID estimation for each target individually
results = ResultsPID(
n_nodes=data.n_processes,
n_realisations=data.n_realisations(),
normalised=data.normalise,
)
for t, target in enumerate(targets):
if settings["verbose"]:
print(f"\n####### analysing target with index {t} from list {targets}")
settings["lags_pid"] = list_of_lags[t]
res_single = self.analyse_single_target(settings, data, target, sources[t])
results.combine_results(res_single)
# Get no. realisations actually used for estimation from single target
# analysis.
results.data_properties.n_realisations = (
res_single.data_properties.n_realisations
)
return results
    def analyse_single_target(self, settings, data, target, sources):
"""Estimate partial information decomposition for a network node.
Estimate partial information decomposition (PID) for a target node in
the network.
Note:
For a description of the algorithm and the method see references in
the class and estimator docstrings.
Example:
>>> n = 20
>>> alph = 2
>>> x = np.random.randint(0, alph, n)
>>> y = np.random.randint(0, alph, n)
>>> z = np.logical_xor(x, y).astype(int)
>>> data = Data(np.vstack((x, y, z)), 'ps', normalise=False)
>>> settings = {
>>> 'alpha': 0.1,
>>> 'alph_s1': alph,
>>> 'alph_s2': alph,
>>> 'alph_t': alph,
>>> 'max_unsuc_swaps_row_parm': 60,
>>> 'num_reps': 63,
>>> 'max_iters': 1000,
            >>> 'pid_estimator': 'SydneyPID',
>>> 'lags_pid': [2, 3]}
>>> pid_analysis = BivariatePID()
>>> results = pid_analysis.analyse_single_target(settings=settings,
>>> data=data,
>>> target=0,
>>> sources=[1, 2])
        Args:
            settings : dict
                parameters for estimator use and statistics:

                - pid_estimator : str - estimator to be used for PID
                  estimation (for estimator settings see the documentation in
                  the estimators_pid modules)
                - lags_pid : list of ints [optional] - lags in samples between
                  sources and target (default=[1, 1])
                - verbose : bool [optional] - toggle console output
                  (default=True)
data : Data instance
raw data for analysis
target : int
                index of target process
sources : list of ints
indices of the two source processes for the target
        Returns:
            ResultsPID instance
                results of network inference, see documentation of
                ResultsPID()
"""
# Check input and initialise values for analysis.
self._initialise(settings, data, target, sources)
# Estimate PID and significance.
self._calculate_pid(data)
# Add analysis info.
results = ResultsPID(
n_nodes=data.n_processes,
n_realisations=data.n_realisations(self.current_value),
normalised=data.normalise,
)
results._add_single_result(
settings=self.settings, target=self.target, results=self.results
)
self._reset()
return results
def _initialise(self, settings, data, target, sources):
"""Check input, set initial or default values for analysis settings."""
# Check requested PID estimator.
assert "pid_estimator" in settings, "Estimator was not specified!"
self._pid_estimator = get_estimator(settings["pid_estimator"], settings)
self.settings = settings.copy()
self.settings.setdefault("lags_pid", [1, 1])
self.settings.setdefault("verbose", True)
# Check if provided lags are correct and work with the number of
# samples in the data.
if len(self.settings["lags_pid"]) != 2:
raise RuntimeError("List of lags must have length 2.")
if self.settings["lags_pid"][0] >= data.n_samples:
raise RuntimeError(
f"Lag 1 ({self.settings['lags_pid'][0]}) is larger than the number of "
f"samples in the data set ({data.n_samples})."
)
if self.settings["lags_pid"][1] >= data.n_samples:
raise RuntimeError(
f"Lag 2 ({self.settings['lags_pid'][1]}) is larger than the number of "
"samples in the data set ({data.n_samples})."
)
# Check if target and sources are provided correctly.
if not isinstance(target, int):
raise RuntimeError("Target must be an integer.")
if len(sources) != 2:
raise RuntimeError("List of sources must have length 2.")
if target in sources:
raise RuntimeError(
f"The target ({target}) should not be in the list of sources ({sources})."
)
self.current_value = (target, max(self.settings["lags_pid"]))
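        # The current value marks the 'present' sample of the target; the
        # maximum lag determines how many initial samples cannot be used
        # because a lagged source variable would fall outside the data.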
self.target = target
# TODO works for single vars only, change to multivariate?
self.sources = self._lag_to_idx(
[
(sources[0], self.settings["lags_pid"][0]),
(sources[1], self.settings["lags_pid"][1]),
]
)
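        # A sketch of the conversion above (the exact arithmetic lives in
        # SingleProcessAnalysis._lag_to_idx()): each (process, lag) pair is
        # assumed to map to an absolute sample index relative to the current
        # value, e.g. for current_value = (0, 3) and lags_pid = [2, 3]:
        #
        #   [(1, 2), (2, 3)]  ->  [(1, 3 - 2), (2, 3 - 3)]  ==  [(1, 1), (2, 0)]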
def _calculate_pid(self, data):
# TODO Discuss how and if the following statistical testing should be
        # included. Remove dummy results.
# [orig_pid, sign_1, p_val_1,
# sign_2, p_val_2] = stats.unq_against_surrogates(self, data)
# [orig_pid, sign_shd,
# p_val_shd, sign_syn, p_val_syn] = stats.syn_shd_against_surrogates(
        #     self, data)
# sign_1 = sign_2 = sign_shd = sign_syn = False
# p_val_1 = p_val_2 = p_val_shd = p_val_syn = 1.0
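        # data.get_realisations() returns a tuple whose first entry holds the
        # realisations array, hence the trailing [0] on each call below.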
target_realisations = data.get_realisations(
self.current_value, [self.current_value]
)[0]
source_1_realisations = data.get_realisations(
self.current_value, [self.sources[0]]
)[0]
source_2_realisations = data.get_realisations(
self.current_value, [self.sources[1]]
)[0]
orig_pid = self._pid_estimator.estimate(
s1=source_1_realisations, s2=source_2_realisations, t=target_realisations
)
if self.settings["verbose"]:
print(
f"\nunq information s1: {orig_pid['unq_s1']:.8f}, s2: {orig_pid['unq_s2']:.8f}"
)
print(
f"shd information: {orig_pid['shd_s1_s2']:.8f}, syn information: {orig_pid['syn_s1_s2']:.8f}"
)
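        # The estimator is expected to return a dict with at least the keys
        # 'unq_s1', 'unq_s2', 'shd_s1_s2', and 'syn_s1_s2'; the source
        # variables and the current value are attached below so results can
        # be collected per target.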
self.results = orig_pid
self.results["source_1"] = self._idx_to_lag([self.sources[0]])
self.results["source_2"] = self._idx_to_lag([self.sources[1]])
self.results["selected_vars_sources"] = [
self.results["source_1"][0],
self.results["source_2"][0],
]
self.results["current_value"] = self.current_value
# self.results['unq_s1_sign'] = sign_1
# self.results['unq_s2_sign'] = sign_2
# self.results['unq_s1_p_val'] = p_val_1
# self.results['unq_s2_p_val'] = p_val_2
# self.results['syn_sign'] = sign_syn
# self.results['syn_p_val'] = p_val_syn
# self.results['shd_sign'] = sign_shd
# self.results['shd_p_val'] = p_val_shd
# TODO make mi_against_surrogates in stats more generic, such that
# it becomes an arbitrary permutation test where one argument gets
# shuffled and then all arguments are passed to the provided estimator
def _reset(self):
"""Reset instance after analysis."""
self.__init__()
del self.results
del self.settings
del self._pid_estimator
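

# Minimal smoke-test sketch (not part of the class): it mirrors the XOR
# example from the docstrings above, assuming IDTxl's package layout
# (idtxl.data.Data) and the 'SydneyPID' estimator are available. Run via
# `python -m idtxl.bivariate_pid`; parameter values are illustrative only.
if __name__ == "__main__":
    from idtxl.data import Data

    n = 1000
    alph = 2
    x = np.random.randint(0, alph, n)
    y = np.random.randint(0, alph, n)
    # Target is the XOR of both sources, delayed by one sample, so lags_pid
    # of [1, 1] matches the true coupling.
    z = np.zeros(n, dtype=int)
    z[1:] = np.logical_xor(x[:-1], y[:-1]).astype(int)
    data = Data(np.vstack((x, y, z)), "ps", normalise=False)
    settings = {
        "alph_s1": alph,
        "alph_s2": alph,
        "alph_t": alph,
        "max_unsuc_swaps_row_parm": 60,
        "num_reps": 63,
        "max_iters": 1000,
        "pid_estimator": "SydneyPID",
        "lags_pid": [1, 1],
    }
    results = BivariatePID().analyse_single_target(
        settings=settings, data=data, target=2, sources=[0, 1]
    )
    # For a noise-free XOR relation most of the information should be
    # synergistic; unique and shared contributions should be close to zero.
    # See the ResultsPID documentation for accessor methods.
    print(results)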