# Source code for bmdcluster

# Module metadata exposed as dunder attributes.
__author__ = "Carson Sprock"
__email__ = "csprock@gmail.com"
__version__ = "0.3.1"


import warnings
import numpy as np

from bmdcluster.optimizers.blockdiagonalBMD import run_bd_BMD
from bmdcluster.optimizers.generalBMD import run_BMD
from bmdcluster.initializers.primary_initializer import initialize_general
from bmdcluster.initializers.primary_initializer import initialize_block_diagonal
from bmdcluster.optimizers.generalBMD import _updateA
from bmdcluster.optimizers.blockdiagonalBMD import _bd_updateA

class _BMD:
    """Common base class holding the label-extraction helper shared by the BMD estimators."""

    def __init__(self):
        pass

    @staticmethod
    def _get_labels(M):
        """Collapse a cluster assignment matrix into a vector of cluster labels.

        Parameters
        ----------
        M : np.array
            two-dimensional cluster assignment matrix with one row per item
            and one column per cluster

        Returns
        -------
        np.array
            one label per row of ``M``: the column index of the row's largest
            entry, or -1 when the row sums to less than 1 (an outlier)
        """
        n_rows = M.shape[0]

        # Rows whose entries sum to less than 1 belong to no cluster.
        is_outlier = M.sum(axis=1) < 1
        assigned = np.logical_not(is_outlier)

        # Start with every row marked as an outlier, then fill in the rest.
        labels = np.full(shape=(n_rows, ), fill_value=-1)
        labels[assigned] = M[assigned, :].argmax(axis=1)

        return labels


class blockdiagonalBMD(_BMD):
    """Estimator running the block-diagonal form of the BMD algorithm.

    The constructor only validates and stores its configuration. :meth:`fit`
    builds the initial data cluster assignment matrix with
    ``initialize_block_diagonal`` and hands it to ``run_bd_BMD``; afterwards
    the attributes ``W``, ``cost``, ``A`` and ``B`` are set on the instance.
    """

    def __init__(self, n_clusters, max_iter=100, use_bootstrap=False, b=None, init_ratio=1.0, seed=None):
        """Run the block-diagonal form of the BMD algorithm.

        Parameters
        ----------
        n_clusters : int
            number of data clusters
        max_iter : int, optional
            maximum number of optimization iterations, by default 100
        use_bootstrap : bool, optional
            use bootstrap cluster initialization, by default False
        b : int, optional
            number of bootstrapped samples to use, by default None
        init_ratio : float, optional
            fraction of points to randomly initialize, by default 1.0
        seed : int, optional
            random initialization seed, by default None

        Raises
        ------
        ValueError
            If :code:`use_bootstrap` is set to True but :code:`b` is not specified

        """

        # Bootstrapped initialization needs a sample count; None and 0 are both rejected.
        if use_bootstrap and not b:
            raise ValueError("Must specify keyword argument 'b' when using bootstrapping.")

        self.n_clusters = n_clusters
        self.use_bootstrap = use_bootstrap
        self.b = b
        self.init_ratio = init_ratio
        self.seed = seed
        self.max_iter = max_iter

        super(blockdiagonalBMD, self).__init__()

    def fit(self, W, verbose=False):
        """Fit the model.

        Stores the data matrix on ``self.W`` and the optimizer's results on
        ``self.cost``, ``self.A`` and ``self.B``.

        Parameters
        ----------
        W : np.array
            binary data matrix
        verbose : bool, optional
            print progress during optimization, by default False

        """

        self.W = W

        # Initialize cluster indicator matrices.
        self.A = initialize_block_diagonal(W=self.W,
                                           n_clusters=self.n_clusters,
                                           use_bootstrap=self.use_bootstrap,
                                           b=self.b,
                                           init_ratio=self.init_ratio,
                                           seed=self.seed)

        self.cost, self.A, self.B = run_bd_BMD(self.A, self.W, self.max_iter, verbose)

    def predict(self, W):
        """Predict cluster labels of new data.

        Requires a prior call to :meth:`fit`, which sets ``self.B``.

        Parameters
        ----------
        W : np.array
            binary data matrix

        Returns
        -------
        np.array
            predicted cluster labels
        """

        # Labels are derived from the same assignment matrix transform() returns.
        return self._get_labels(self.transform(W))

    def transform(self, W):
        """Predict cluster assignment matrix of new data.

        Requires a prior call to :meth:`fit`, which sets ``self.B``.

        Parameters
        ----------
        W : np.array
            binary data matrix

        Returns
        -------
        np.array
            predicted cluster assignment matrix
        """

        n = W.shape[0]

        # Zero-filled placeholder with one row per new data point, passed to
        # the update step together with the fitted B.
        A_dummy = np.zeros((n, self.n_clusters))

        A_pred = _bd_updateA(A_dummy, self.B, W)

        return A_pred

    def get_feature_labels(self):
        """Get feature cluster labels after .fit(). Outliers will be labeled -1.

        Returns
        -------
        np.array
            feature cluster labels
        """
        return self._get_labels(self.B)

    def get_data_labels(self):
        """Get data cluster labels after .fit(). Outliers will be labeled -1.

        Returns
        -------
        np.array
            data cluster labels
        """
        return self._get_labels(self.A)

    def fit_predict(self, W, verbose=False):
        """Fit the model and return final value of objective function and
        cluster assignment labels for the data and features.

        Parameters
        ----------
        W : np.array
            binary data matrix
        verbose : bool, optional
            print progress during optimization, by default False

        Returns
        -------
        float
            final value of objective function
        np.array
            data cluster labels
        np.array
            feature cluster labels
        """

        self.fit(W, verbose)

        return self.cost, self._get_labels(self.A), self._get_labels(self.B)

    def fit_transform(self, W, verbose=False):
        """Fit the model and return final value of objective function
        and final values of the data and feature cluster assignment
        matrices A and B, whose entries are cluster affinity scores.

        Parameters
        ----------
        W : np.array
            binary data matrix
        verbose : bool, optional
            print progress during optimization, by default False

        Returns
        -------
        float
            final cost of objective function
        np.array
            final value of data cluster assignment matrix A
        np.array
            final value of feature cluster assignment matrix B

        """

        self.fit(W, verbose)

        return self.cost, self.A, self.B



class generalBMD(_BMD):
    """Estimator running the general form of the BMD algorithm.

    The constructor only validates and stores its configuration. :meth:`fit`
    builds the initial cluster assignment matrices with ``initialize_general``
    and hands them to ``run_BMD``; afterwards the attributes ``W``, ``cost``,
    ``A``, ``B`` and ``X`` are set on the instance.
    """

    def __init__(self, n_clusters, f_clusters=None, B_ident=True, max_iter=100, use_bootstrap=False, b=None, init_ratio=1.0, seed=None):
        """Run the general form of the BMD algorithm.

        Parameters
        ----------
        n_clusters : int
            number of data clusters
        f_clusters : int, optional
            number of feature clusters, by default None
        B_ident : bool, optional
            initialize feature cluster assignment matrix to the identity, by default True
        max_iter : int, optional
            maximum number of optimization iterations, by default 100
        use_bootstrap : bool, optional
            use bootstrap cluster initialization, by default False
        b : int, optional
            number of bootstrapped samples to use, by default None
        init_ratio : float, optional
            fraction of points to randomly initialize, by default 1.0
        seed : int, optional
            random initialization seed, by default None

        Raises
        ------
        ValueError
            If :code:`use_bootstrap` is set to True but :code:`b` is not specified
        ValueError
            If neither :code:`B_ident` nor :code:`f_clusters` is specified
        ValueError
            If both :code:`B_ident=True` and :code:`f_clusters` are set

        Caution
        -------
        :code:`B_ident=True` and :code:`f_clusters` are mutually exclusive options; setting both
        will result in an error.

        """

        # Bootstrapped initialization needs a sample count; None and 0 are both rejected.
        if use_bootstrap and not b:
            raise ValueError("Must specify keyword argument 'b' when using bootstrapping.")

        # Exactly one way of defining the feature clusters must be chosen.
        if not B_ident and not f_clusters:
            raise ValueError("You must specify one of either 'B_ident' or 'f_clusters'")

        if B_ident and f_clusters is not None:
            raise ValueError("Cannot set B_ident to True and set f_clusters")

        self.n_clusters = n_clusters
        self.B_ident = B_ident
        self.use_bootstrap = use_bootstrap
        self.b = b
        self.init_ratio = init_ratio
        self.f_clusters = f_clusters
        self.seed = seed
        self.max_iter = max_iter

        super(generalBMD, self).__init__()

    def fit(self, W, verbose=False):
        """Fit the model.

        Stores the data matrix on ``self.W`` and the optimizer's results on
        ``self.cost``, ``self.A``, ``self.B`` and ``self.X``.

        Parameters
        ----------
        W : np.array
            binary data matrix
        verbose : bool, optional
            print progress during optimization, by default False

        """
        self.W = W

        # Initialize cluster indicator matrices.
        self.A, self.B = initialize_general(W=self.W,
                                            n_clusters=self.n_clusters,
                                            use_bootstrap=self.use_bootstrap,
                                            B_ident=self.B_ident,
                                            b=self.b,
                                            init_ratio=self.init_ratio,
                                            seed=self.seed,
                                            f_clusters=self.f_clusters)

        self.cost, self.A, self.B, self.X = run_BMD(self.A, self.B, self.W, self.max_iter, verbose)

    def predict(self, W):
        """Predict cluster labels of new data.

        Requires a prior call to :meth:`fit`, which sets ``self.B`` and ``self.X``.

        Parameters
        ----------
        W : np.array
            binary data matrix

        Returns
        -------
        np.array
            predicted cluster labels
        """

        # Labels are derived from the same assignment matrix transform() returns.
        return self._get_labels(self.transform(W))

    def transform(self, W):
        """Predict cluster assignment matrix of new data.

        Requires a prior call to :meth:`fit`, which sets ``self.B`` and ``self.X``.

        Parameters
        ----------
        W : np.array
            binary data matrix

        Returns
        -------
        np.array
            predicted cluster assignment matrix
        """

        n = W.shape[0]

        # Zero-filled placeholder with one row per new data point, passed to
        # the update step together with the fitted B and X.
        A_dummy = np.zeros((n, self.n_clusters))

        A_pred = _updateA(A_dummy, self.B, self.X, W)

        return A_pred

    def get_feature_labels(self):
        """Get feature cluster labels after .fit(). Outliers will be labeled -1.

        Returns
        -------
        np.array
            feature cluster labels
        """
        return self._get_labels(self.B)

    def get_data_labels(self):
        """Get data cluster labels after .fit(). Outliers will be labeled -1.

        Returns
        -------
        np.array
            data cluster labels
        """
        return self._get_labels(self.A)

    def fit_predict(self, W, verbose=False):
        """Fit the model and return final value of objective function and
        cluster assignment labels for the data and features.

        Parameters
        ----------
        W : np.array
            binary data matrix
        verbose : bool, optional
            print progress during optimization, by default False

        Returns
        -------
        float
            final value of objective function
        np.array
            data cluster labels
        np.array
            feature cluster labels
        """

        self.fit(W, verbose)

        return self.cost, self._get_labels(self.A), self._get_labels(self.B)

    def fit_transform(self, W, verbose=False):
        """Fit the model and return final value of objective function
        and final values of the data and feature cluster assignment
        matrices A and B, whose entries are cluster affinity scores.

        Parameters
        ----------
        W : np.array
            binary data matrix
        verbose : bool, optional
            print progress during optimization, by default False

        Returns
        -------
        float
            final cost of objective function
        np.array
            final value of data cluster assignment matrix A
        np.array
            final value of feature cluster assignment matrix B
        """

        self.fit(W, verbose)

        return self.cost, self.A, self.B