# Module-level metadata: author, contact address and release version string.
__author__ = "Carson Sprock"
__email__ = "csprock@gmail.com"
__version__ = "0.3.1"
import warnings
import numpy as np
from bmdcluster.optimizers.blockdiagonalBMD import run_bd_BMD
from bmdcluster.optimizers.generalBMD import run_BMD
from bmdcluster.initializers.primary_initializer import initialize_general
from bmdcluster.initializers.primary_initializer import initialize_block_diagonal
from bmdcluster.optimizers.generalBMD import _updateA
from bmdcluster.optimizers.blockdiagonalBMD import _bd_updateA
class _BMD:
    """Common base class for the BMD estimators in this module.

    It only provides the shared helper that turns a cluster indicator
    matrix into a vector of integer labels.
    """

    def __init__(self):
        pass

    @staticmethod
    def _get_labels(M):
        """Convert a cluster indicator matrix into a vector of labels.

        Parameters
        ----------
        M : array-like
            two-dimensional indicator matrix with one row per item and one
            column per cluster

        Returns
        -------
        np.array
            for every row, the column index of its largest entry; rows whose
            entries sum to less than 1 are treated as outliers and labeled -1
        """
        # Accept lists / np.matrix as well as plain arrays.
        M = np.asarray(M)

        labels = np.full(shape=(M.shape[0],), fill_value=-1)
        outliers = M.sum(axis=1) < 1
        assigned = ~outliers

        # argmax raises on an empty axis (e.g. a matrix with zero columns),
        # so only call it when at least one row is actually assigned.
        if assigned.any():
            labels[assigned] = M[assigned, :].argmax(axis=1)

        return labels
class blockdiagonalBMD(_BMD):
    """Block-diagonal form of the BMD clustering algorithm.

    After :meth:`fit` the instance exposes ``W`` (the matrix passed to
    ``fit``), ``A`` and ``B`` (the matrices from which the data and feature
    cluster labels are derived) and ``cost`` (the final value of the
    objective function).
    """

    def __init__(self, n_clusters, max_iter=100, use_bootstrap=False, b=None, init_ratio=1.0, seed=None):
        """Run the block-diagonal form of the BMD algorithm.

        Parameters
        ----------
        n_clusters : int
            number of data clusters
        max_iter : int, optional
            maximum number of optimization iterations, by default 100
        use_bootstrap : bool, optional
            use bootstrap cluster initialization, by default False
        b : int, optional
            number of bootstrapped samples to use, by default None
        init_ratio : float, optional
            fraction of points to randomly initialize, by default 1.0
        seed : int, optional
            random initialization seed, by default None

        Raises
        ------
        ValueError
            If :code:`use_bootstrap` is set to True but :code:`b` is not specified
        """
        if use_bootstrap and not b:
            raise ValueError("Must specify keyword argument 'b' when using bootstrapping.")

        self.n_clusters = n_clusters
        self.use_bootstrap = use_bootstrap
        self.b = b
        self.init_ratio = init_ratio
        self.seed = seed
        self.max_iter = max_iter

        super(blockdiagonalBMD, self).__init__()

    def _require_fitted(self, *names):
        """Raise a descriptive AttributeError if a fitted attribute is missing."""
        missing = [name for name in names if not hasattr(self, name)]
        if missing:
            # AttributeError is what the bare attribute access raised before,
            # so callers catching it keep working; only the message improves.
            raise AttributeError(
                "{} is not fitted yet (missing attribute(s): {}); call 'fit' first.".format(
                    type(self).__name__, ", ".join(missing)))

    def fit(self, W, verbose=False):
        """Fit the model.

        Stores the data as ``self.W`` and the optimizer results as
        ``self.cost``, ``self.A`` and ``self.B``.

        Parameters
        ----------
        W : np.array
            binary data matrix
        verbose : bool, optional
            print progress during optimization, by default False
        """
        self.W = W

        # Initialize cluster indicator matrices.
        self.A = initialize_block_diagonal(W=self.W,
                                           n_clusters=self.n_clusters,
                                           use_bootstrap=self.use_bootstrap,
                                           b=self.b,
                                           init_ratio=self.init_ratio,
                                           seed=self.seed)

        self.cost, self.A, self.B = run_bd_BMD(self.A, self.W, self.max_iter, verbose)

    def predict(self, W):
        """Predict cluster labels of new data.

        Parameters
        ----------
        W : np.array
            binary data matrix

        Returns
        -------
        np.array
            predicted cluster labels

        Raises
        ------
        AttributeError
            If the model has not been fitted
        """
        self._require_fitted("B")

        n = W.shape[0]
        # All-zero placeholder assignment; presumably _bd_updateA only needs
        # its shape -- confirm against the optimizer module.
        A_dummy = np.zeros((n, self.n_clusters))
        A_pred = _bd_updateA(A_dummy, self.B, W)

        return self._get_labels(A_pred)

    def get_feature_labels(self):
        """Get feature cluster labels after .fit(). Outliers will be labeled -1.

        Returns
        -------
        np.array
            feature cluster labels

        Raises
        ------
        AttributeError
            If the model has not been fitted
        """
        self._require_fitted("B")
        return self._get_labels(self.B)

    def get_data_labels(self):
        """Get data cluster labels after .fit(). Outliers will be labeled -1.

        Returns
        -------
        np.array
            data cluster labels

        Raises
        ------
        AttributeError
            If the model has not been fitted
        """
        self._require_fitted("A")
        return self._get_labels(self.A)

    def fit_predict(self, W, verbose=False):
        """Fit the model and return final value of objective function and
        cluster assignment labels for the data and features.

        Parameters
        ----------
        W : np.array
            binary data matrix
        verbose : bool, optional
            print progress during optimization, by default False

        Returns
        -------
        float
            final value of objective function
        np.array
            data cluster labels
        np.array
            feature cluster labels
        """
        self.fit(W, verbose)
        return self.cost, self._get_labels(self.A), self._get_labels(self.B)
class generalBMD(_BMD):
    """General form of the BMD clustering algorithm.

    After :meth:`fit` the instance exposes ``W`` (the matrix passed to
    ``fit``), ``A`` and ``B`` (the matrices from which the data and feature
    cluster labels are derived), ``X`` (the additional matrix returned by the
    optimizer and reused by :meth:`predict`) and ``cost`` (the final value of
    the objective function).
    """

    def __init__(self, n_clusters, f_clusters=None, B_ident=True, max_iter=100, use_bootstrap=False, b=None, init_ratio=1.0, seed=None):
        """Run the general form of the BMD algorithm.

        Parameters
        ----------
        n_clusters : int
            number of data clusters
        f_clusters : int, optional
            number of feature clusters, by default None
        B_ident : bool, optional
            initialize feature cluster assignment matrix to the identity, by default True
        max_iter : int, optional
            maximum number of optimization iterations, by default 100
        use_bootstrap : bool, optional
            use bootstrap cluster initialization, by default False
        b : int, optional
            number of bootstrapped samples to use, by default None
        init_ratio : float, optional
            fraction of points to randomly initialize, by default 1.0
        seed : int, optional
            random initialization seed, by default None

        Raises
        ------
        ValueError
            If :code:`use_bootstrap` is set to True but :code:`b` is not specified
        ValueError
            If :code:`B_ident` is False and :code:`f_clusters` is not specified
        ValueError
            If both :code:`B_ident=True` and :code:`f_clusters` is set

        Caution
        -------
        :code:`B_ident=True` and :code:`f_clusters` are mutually exclusive options;
        setting both will result in an error.
        """
        if use_bootstrap and not b:
            raise ValueError("Must specify keyword argument 'b' when using bootstrapping.")

        # Exactly one of the two feature-side options has to be chosen.
        if not B_ident and not f_clusters:
            raise ValueError("You must specify one of either 'B_ident' or 'f_clusters'")

        if B_ident and f_clusters is not None:
            raise ValueError("Cannot set B_ident to True and set f_clusters")

        self.n_clusters = n_clusters
        self.B_ident = B_ident
        self.use_bootstrap = use_bootstrap
        self.b = b
        self.init_ratio = init_ratio
        self.f_clusters = f_clusters
        self.seed = seed
        self.max_iter = max_iter

        super(generalBMD, self).__init__()

    def _require_fitted(self, *names):
        """Raise a descriptive AttributeError if a fitted attribute is missing."""
        missing = [name for name in names if not hasattr(self, name)]
        if missing:
            # AttributeError is what the bare attribute access raised before,
            # so callers catching it keep working; only the message improves.
            raise AttributeError(
                "{} is not fitted yet (missing attribute(s): {}); call 'fit' first.".format(
                    type(self).__name__, ", ".join(missing)))

    def fit(self, W, verbose=False):
        """Fit the model.

        Stores the data as ``self.W`` and the optimizer results as
        ``self.cost``, ``self.A``, ``self.B`` and ``self.X``.

        Parameters
        ----------
        W : np.array
            binary data matrix
        verbose : bool, optional
            print progress during optimization, by default False
        """
        self.W = W

        # Initialize cluster indicator matrices.
        self.A, self.B = initialize_general(W=self.W,
                                            n_clusters=self.n_clusters,
                                            use_bootstrap=self.use_bootstrap,
                                            B_ident=self.B_ident,
                                            b=self.b,
                                            init_ratio=self.init_ratio,
                                            seed=self.seed,
                                            f_clusters=self.f_clusters)

        self.cost, self.A, self.B, self.X = run_BMD(self.A, self.B, self.W, self.max_iter, verbose)

    def predict(self, W):
        """Predict cluster labels of new data.

        Parameters
        ----------
        W : np.array
            binary data matrix

        Returns
        -------
        np.array
            predicted cluster labels

        Raises
        ------
        AttributeError
            If the model has not been fitted
        """
        self._require_fitted("B", "X")

        n = W.shape[0]
        # All-zero placeholder assignment; presumably _updateA only needs
        # its shape -- confirm against the optimizer module.
        A_dummy = np.zeros((n, self.n_clusters))
        A_pred = _updateA(A_dummy, self.B, self.X, W)

        return self._get_labels(A_pred)

    def get_feature_labels(self):
        """Get feature cluster labels after .fit(). Outliers will be labeled -1.

        Returns
        -------
        np.array
            feature cluster labels

        Raises
        ------
        AttributeError
            If the model has not been fitted
        """
        self._require_fitted("B")
        return self._get_labels(self.B)

    def get_data_labels(self):
        """Get data cluster labels after .fit(). Outliers will be labeled -1.

        Returns
        -------
        np.array
            data cluster labels

        Raises
        ------
        AttributeError
            If the model has not been fitted
        """
        self._require_fitted("A")
        return self._get_labels(self.A)

    def fit_predict(self, W, verbose=False):
        """Fit the model and return final value of objective function and
        cluster assignment labels for the data and features.

        Parameters
        ----------
        W : np.array
            binary data matrix
        verbose : bool, optional
            print progress during optimization, by default False

        Returns
        -------
        float
            final value of objective function
        np.array
            data cluster labels
        np.array
            feature cluster labels
        """
        self.fit(W, verbose)
        return self.cost, self._get_labels(self.A), self._get_labels(self.B)