diff --git a/README.md b/README.md
index 644ae73c28285d064aab10a54446a0e0d956465a..b87d9e2736a22d7e336090b9d1c3e9ca0f0f0cbd 100644
--- a/README.md
+++ b/README.md
@@ -29,8 +29,7 @@ seed = 240820
 
 # Run NMFProfiler
 model = NMFProfiler(
-    omic1=ToyExample().omic1,
-    omic2=ToyExample().omic2,
+    omics=[ToyExample().omic1, ToyExample().omic2],
     y=ToyExample().y,
     seed=seed,
     as_sklearn=False,
@@ -42,8 +41,8 @@ print(res)
 
 # Visualize analyzed datasets (samples x features)
 ToyExample().y  # 2 groups
-res.heatmap(obj_to_viz="omic1", width=15, height=6, path="")
-res.heatmap(obj_to_viz="omic2", width=15, height=6, path="")
+res.heatmap(obj_to_viz="omic", width=15, height=6, path="", omic_number=1)
+res.heatmap(obj_to_viz="omic", width=15, height=6, path="", omic_number=2)
 ```
 
 
@@ -60,8 +59,8 @@ res.heatmap(obj_to_viz="W", width=10, height=10, path="")
 
 ```python
 # Visualize signature matrices H1 and H2 obtained (2 x features)
-res.heatmap(obj_to_viz="H1", width=15, height=6, path="")
-res.heatmap(obj_to_viz="H2", width=15, height=6, path="")
+res.heatmap(obj_to_viz="H", width=15, height=6, path="", omic_number=1)
+res.heatmap(obj_to_viz="H", width=15, height=6, path="", omic_number=2)
 ```
 
 
diff --git a/codemeta.json b/codemeta.json
index 030de6ab5d4936beac19f10c09262dd57b6beb24..1efac7b9bc7bd63a548853f3721a9c244ac036a7 100644
--- a/codemeta.json
+++ b/codemeta.json
@@ -5,9 +5,9 @@
     "codeRepository": "https://forgemia.inra.fr/omics-integration/nmfprofiler",
     "dateCreated": "2024-06-24",
     "datePublished": "2024-08-22",
-    "dateModified": "2024-08-23",
+    "dateModified": "2024-12-18",
     "name": "NMFProfiler",
-    "version": "0.2.1",
+    "version": "0.3.0",
     "description": "NMFProfiler: an integrative supervised Non-Negative Matrix Factorization to extract typical profiles of groups of interest combining two different datasets.",
     "applicationCategory": "Biostatistics",
     "funding": "2022/0051",
diff --git a/docs/source/conf.py b/docs/source/conf.py
index cead502277ade12751b9063543106b7d230e99f5..546b33a468301d4903cc329345bf4406dc14458e 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -21,7 +21,7 @@ sys.path.insert(0, os.path.abspath("../.."))
 project = "NMFProfiler"
 copyright = "2024, Aurélie Mercadié"
 author = "Aurélie Mercadié"
-release = "0.2.1"
+release = "0.3.0"
 
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
diff --git a/nmfprofiler/nmfprofiler.py b/nmfprofiler/nmfprofiler.py
index 5b841efcf43688baf18399825d3d2c78489b5d33..ec87ed36c5406b45808153493520f5682f3cda8e 100644
--- a/nmfprofiler/nmfprofiler.py
+++ b/nmfprofiler/nmfprofiler.py
@@ -11,6 +11,8 @@ groups
 
 from copy import deepcopy
 
+import itertools
+
 import warnings
 import matplotlib.pyplot as plt
 import seaborn as sns
@@ -37,7 +39,7 @@ def _prox_l1_pos(v, lambd):
     return np.maximum(v - lambd, 0)
 
 
-def _update_W(W, omic1, omic2, H1, H2, mu):
+def _update_W(W, omics, Hs, mu):
     """Update the matrix containing contributions of individuals to
     latent components.
 
@@ -45,15 +47,11 @@ def _update_W(W, omic1, omic2, H1, H2, mu):
     ----------
     :W: ndarray of shape (n_samples x K), contributions of individuals
         to latent components.
-    :omic1: ndarray of shape (n_samples x n_features_omic1), values of
-        features from (omic1) measured on (n_samples).
-    :omic2: ndarray of shape (n_samples x n_features_omic2), values of
-        features from (omic2) measured on the same (n_samples) samples as
-        (omic1).
- :H1: ndarray of shape (K x n_features_omic1), latent components - built on (omic1). - :H2: ndarray of shape (K x n_features_omic2), latent components built on - (omic2). + :omics: list of (n_omics) ndarrays of shape (n_samples x n_features_omicj), + values of features from (omicj) measured on the same + (n_samples) samples. + :Hs: list of (n_omics) ndarrays of shape (K x n_features_omicj), + latent components built on (omicj). :mu: float, value for parameter `mu` from objective function. @@ -61,12 +59,12 @@ def _update_W(W, omic1, omic2, H1, H2, mu): ------- Newly updated W. """ - B = ( - safe_sparse_dot(H1, H1.T) - + safe_sparse_dot(H2, H2.T) - + (mu * np.identity(W.shape[1])) - ) - C = safe_sparse_dot(omic1, H1.T) + safe_sparse_dot(omic2, H2.T) + B = mu * np.identity(W.shape[1]) + for j in range(len(Hs)): + B += safe_sparse_dot(Hs[j], Hs[j].T) + C = np.zeros((W.shape[0], W.shape[1])) + for j in range(len(Hs)): + C += safe_sparse_dot(omics[j], Hs[j].T) return multiply(W, divide(C, safe_sparse_dot(W, B))) @@ -85,7 +83,7 @@ def _update_H( H_grad=None ): """Update the matrix containing latent components built on omic j. - (j referring to the omic dataset number, either 1 or 2) + (j referring to the omic dataset number, between 1 and J) Parameters ---------- @@ -154,7 +152,7 @@ def _update_H( def _update_Beta(omic, h_k, y_k, gamma, Beta): """Update the regression coefficients linked to omic j and component k. - (j referring to the omic dataset number, either 1 or 2) + (j referring to the omic dataset number, between 1 and J) Parameters ---------- @@ -191,21 +189,18 @@ def _update_Beta(omic, h_k, y_k, gamma, Beta): def _computeF( - omic1, - omic2, + omics, Y, W, - H1, - H2, - Beta1, - Beta2, + Hs, + Betas, mu, lambdA, gamma, K_gp, details=False ): - """Compute the value of F given omic1, omic2, Y, W, H1, H2, Beta1, Beta2 + """Compute the value of F given Y, W, each omicj, Hj, Betaj and some hyperparameters (i.e. gamma, lambda and mu). Calculate the objective function value, as well as each error term, @@ -213,23 +208,18 @@ def _computeF( Parameters ---------- - :omic1: ndarray of shape (n_samples x n_features_omic1), values of - features from (omic1) measured on (n_samples). - :omic2: ndarray of shape (n_samples x n_features_omic2), values of - features from (omic2) measured on the same (n_samples) samples - as (omic1). + :omics: list of (n_omics) ndarrays of shape (n_samples x n_features_omicj), + values of features from (omicj) measured on the same + (n_samples) samples. :Y: ndarray of shape (n_samples x U), one-hot encoding of (y) indicating to which group each sample belongs. :W: ndarray of shape (n_samples x K), contributions of individuals to latent components. - :H1: ndarray of shape (K x n_features_omic1), latent components built on - (omic1). - :H2: ndarray of shape (K x n_features_omic2), latent components built on - (omic2). - :Beta1: ndarray of shape (K x 1), regression coefficients for projection - of individuals from (omic1) onto latent components from H^(1). - :Beta2: ndarray of shape (K x 1), regression coefficients for projection - of individuals from (omic2) onto latent components from H^(2). + :Hs: list of (n_omics) ndarrays of shape (K x n_features_omicj), + latent components built on (omicj). + :Betas: list of (n_omics) ndarrays of shape (K x 1), regression + coefficients for projection of individuals from (omicj) + onto latent components from H^(j). :mu: float, value for parameter `mu` from objective function. 
:lambdA: float, value for parameter `lambda` from objective function.
    :gamma: float, value for parameter `gamma` from objective function.
@@ -243,24 +233,22 @@ def _computeF(
     Either the value of the objective function (i.e. the global loss)
-    alone, or accompagned by each specific term to obtain it.
+    alone, or accompanied by each specific term to obtain it.
     """
-    distort1 = 0.5 * (norm(omic1 - safe_sparse_dot(W, H1)) ** 2)
-    distort2 = 0.5 * (norm(omic2 - safe_sparse_dot(W, H2)) ** 2)
     regul = (mu / 2) * np.trace(safe_sparse_dot(W.T, W))
-    sparse1 = lambdA * norm(H1, 1)
-    sparse2 = lambdA * norm(H2, 1)
-    pred1 = (gamma / 2) * (
-        norm(Y - multi_dot([omic1, H1.T[:, 0:K_gp],
-                            np.diag(Beta1[0:K_gp])])) ** 2
-    )
-    pred2 = (gamma / 2) * (
-        norm(Y - multi_dot([omic2, H2.T[:, 0:K_gp],
-                            np.diag(Beta2[0:K_gp])])) ** 2
-    )
+    distort, sparse, pred = ([] for i in range(3))
+    for j in range(len(omics)):
+        distort.append(0.5 * (norm(omics[j] - safe_sparse_dot(W, Hs[j])) ** 2))
+        sparse.append(lambdA * norm(Hs[j], 1))
+        pred.append(
+            (gamma / 2) * (
+                norm(Y - multi_dot([omics[j], Hs[j].T[:, 0:K_gp],
+                                    np.diag(Betas[j][0:K_gp])])) ** 2
+            )
+        )
 
-    loss = distort1 + distort2 + regul + sparse1 + sparse2 + pred1 + pred2
+    loss = sum(distort) + regul + sum(sparse) + sum(pred)
 
     if details:
-        res = [loss, distort1, distort2, sparse1, sparse2, regul, pred1, pred2]
+        res = [loss, distort, sparse, regul, pred]
 
     return loss if details is False else res
 
@@ -332,7 +320,7 @@ def _linesearch(
     """Find the most suited value for eta^(j).
 
     Calculate the optimal gradient descent step size eta^(j) to update H^(j)
-    (j = 1,2).
+    (j = 1,...,J).
 
-    Instead of reuse each time the initial eta_0^(j), use the last one
+    Instead of reusing each time the initial eta_0^(j), use the last one
     found, eta_(t-1)^(j).
@@ -412,22 +400,18 @@
 
 
 def _analytic_solver(
-    omic1,
-    omic2,
+    omics,
     Y,
     W,
-    H1,
-    H2,
-    Beta1,
-    Beta2,
+    Hs,
+    Betas,
     params,
+    current_etas,
     as_sklearn,
     backtrack,
     max_iter_back,
-    H1_loss,
-    H2_loss,
-    H1_grad,
-    H2_grad,
+    Hlosses,
+    Hgrads,
     K,
    K_gp,
     verbose,
@@ -441,36 +425,31 @@
 
     Parameters
     ----------
-    :omic1: ndarray of shape (n_samples x n_features_omic1), values of
-        features from (omic1) measured on (n_samples).
-    :omic2: ndarray of shape (n_samples x n_features_omic2), values of
-        features from (omic2) measured on the same (n_samples) samples as
-        (omic1).
+    :omics: list of (n_omics) ndarrays of shape (n_samples x n_features_omicj),
+        values of features from (omicj) measured on the same
+        (n_samples) samples.
     :Y: ndarray of shape (n_samples x U), one-hot encoding of (y) indicating
         to which group each sample belongs.
     :W: ndarray of shape (n_samples x K), contributions of individuals
         to latent components, initialized or at (t).
-    :H1: ndarray of shape (K x n_features_omic1), latent components built on
-        (omic1), initialized or at (t).
-    :H2: ndarray of shape (K x n_features_omic2), latent components built on
-        (omic2), initialized or at (t).
-    :Beta1: ndarray of shape (K x 1), regression coefficients for projection
-        of individuals from (omic1) onto (H1), initialized or at (t).
-    :Beta2: ndarray of shape (K x 1), regression coefficients for projection
-        of individuals from (omic2) onto (H2), initialized or at (t).
-    :params: dict of length 8 (optional), values for parameters `gamma`,
-        `lambda`, `mu` from objective function, `eta1`, `eta2` for prox, and
+    :Hs: list of (n_omics) ndarrays of shape (K x n_features_omicj),
+        latent components built on (omicj), initialized or at (t).
+    :Betas: list of (n_omics) ndarrays of shape (K x 1), regression
+        coefficients for projection of individuals from (omicj)
+        onto H^(j), initialized or at (t).
+    :params: dict of length 7 (optional), values for parameters `gamma`,
+        `lambda`, `mu` from objective function, `eta` for prox, and
         `alpha`, `sigma`, `m_back` for linesearch().
-        By default, gamma = 0.005, lambda = 1e-3, mu = 1e-3, eta1 = eta2 = 1,
+        By default, gamma = 0.005, lambda = 1e-3, mu = 1e-3, eta = 1,
         alpha = 2, sigma = 1e-9 and m_back = 1.
+    :current_etas: list of (n_omics) floats, values of parameter `etaj`.
     :as_sklearn: boolean, whether or not a modified version of the MU
         updates from scikit-learn is used.
     :backtrack: boolean, if Backtrack LineSearch is performed or not.
     :max_iter_back: int, maximum number of iterations for the backtrack.
-    :H1_loss: list, values of the marginal loss for H^(1).
-    :H2_loss: list, values of the marginal loss for H^(2).
-    :H1_grad: list, values of gradient in H^(1) (before update).
-    :H2_grad: list, values of gradient in H^(2) (before update).
+    :Hlosses: list of (n_omics) lists, values of the marginal loss for H^(j).
+    :Hgrads: list of (n_omics) lists, values of gradient in H^(j)
+        (before update).
     :K: int (optional), number of latent components to build.
         By default, K = 2.
     :K_gp: int, number of components dedicated to groups profiling.
@@ -480,20 +459,15 @@
     Returns
     -------
     :W_new: ndarray of shape (n_samples x K), updated, or (t+1), W matrix.
-    :H1_new: ndarray of shape (K x n_features_omic1), updated, or (t+1),
-        H1 matrix.
-    :H2_new: ndarray of shape (K x n_features_omic2), updated, or (t+1),
-        H2 matrix.
-    :Beta1_new: vector of length (K), updated, or (t+1), Beta1 vector.
-    :Beta2_new: vector of length (K), updated, or (t+1), Beta2 vector.
-    :params[`eta1`]: float, updated value of parameter `eta1`.
-    :params[`eta2`]: float, updated value of parameter `eta2`.
-    :H1_loss: list, augmented list of marginal losses for H^(1).
-    :H2_loss: list, augmented list of marginal losses for H^(2).
-    :H1_grad: list, augmented list of gradient values in H^(1)
-        (before update).
-    :H2_grad: list, augmented list of gradient values in H^(2)
-        (before update).
+    :Hs_new: list of (n_omics) ndarrays of shape (K x n_features_omicj),
+        updated, or (t+1), Hj matrices.
+    :Betas_new: list of (n_omics) vectors of length (K), updated, or (t+1),
+        Betaj vectors.
+    :current_etas: list of (n_omics) floats, updated values of parameter `etaj`.
+    :Hlosses: list of (n_omics) lists, augmented list of
+        marginal losses for H^(j).
+    :Hgrads: list of (n_omics) lists, augmented list of
+        gradient values in H^(j) (before update).
 
 
     Note
     ----
     See (Fevotte, 2011) and sklearn.decomposition.NMF source code for choice
     of parameters.
""" - # W update with new values of Hs - W_new = _update_W(W, omic1, omic2, H1, H2, params["mu"]) + W_new = _update_W(W, omics, Hs, params["mu"]) # Hs updates (either MU or Proximal) - H1_new, H1_grad = _update_H( - H1, - W_new, - omic1, - Beta1, - Y, - params["gamma"], - params["eta1"], - params["lambda"], - as_sklearn=as_sklearn, - grad=True, - H_grad=H1_grad, - ) - H2_new, H2_grad = _update_H( - H2, - W_new, - omic2, - Beta2, - Y, - params["gamma"], - params["eta2"], - params["lambda"], - as_sklearn=as_sklearn, - grad=True, - H_grad=H2_grad, - ) + Hs_new = [0 for j in range(len(omics))] + for j in range(len(omics)): + Hs_new[j], Hgrads[j] = _update_H( + Hs[j], + W_new, + omics[j], + Betas[j], + Y, + params["gamma"], + current_etas[j], + params["lambda"], + as_sklearn=as_sklearn, + grad=True, + H_grad=Hgrads[j], + ) if backtrack and not as_sklearn: - # LineSearch for H1 - H1_loss.append( - _computeMargH( - omic1, - Y, - W_new, - H1_new, - Beta1, - params["gamma"], - params["lambda"], - K_gp + # LineSearch for each H + for j in range(len(omics)): + Hlosses[j].append( + _computeMargH( + omics[j], + Y, + W_new, + Hs_new[j], + Betas[j], + params["gamma"], + params["lambda"], + K_gp + ) ) - ) - H1_new, H1_loss, params["eta1"] = _linesearch( - H_old=H1, - H=H1_new, - W=W_new, - omic=omic1, - Beta=Beta1, - Y=Y, - gamma=params["gamma"], - lambdA=params["lambda"], - current_eta=params["eta1"], - H_loss=H1_loss, - alpha=params["alpha"], - sigma=params["sigma"], - m_back=params["m_back"], - max_iter_back=max_iter_back, - K_gp=K_gp, - verbose=verbose, - ) - # LineSearch for H2 - H2_loss.append( - _computeMargH( - omic2, - Y, - W_new, - H2_new, - Beta2, - params["gamma"], - params["lambda"], - K_gp + Hs_new[j], Hlosses[j], current_etas[j] = _linesearch( + H_old=Hs[j], + H=Hs_new[j], + W=W_new, + omic=omics[j], + Beta=Betas[j], + Y=Y, + gamma=params["gamma"], + lambdA=params["lambda"], + current_eta=current_etas[j], + H_loss=Hlosses[j], + alpha=params["alpha"], + sigma=params["sigma"], + m_back=params["m_back"], + max_iter_back=max_iter_back, + K_gp=K_gp, + verbose=verbose, ) - ) - H2_new, H2_loss, params["eta2"] = _linesearch( - H_old=H2, - H=H2_new, - W=W_new, - omic=omic2, - Beta=Beta2, - Y=Y, - gamma=params["gamma"], - lambdA=params["lambda"], - current_eta=params["eta2"], - H_loss=H2_loss, - alpha=params["alpha"], - sigma=params["sigma"], - m_back=params["m_back"], - max_iter_back=max_iter_back, - K_gp=K_gp, - verbose=verbose, - ) # Betas updates with new values of Hs # Compute each regression coef. 
of each component separately
     # and gather them in a unique vector
-    Beta1_new = np.array(
-        [
-            _update_Beta(
-                omic1, H1_new[0, :], Y[:, 0], params["gamma"], Beta1[0]
-            ),
-            _update_Beta(
-                omic1, H1_new[1, :], Y[:, 1], params["gamma"], Beta1[1]
-            ),
-        ]
-    )
-
-    Beta2_new = np.array(
-        [
-            _update_Beta(
-                omic2, H2_new[0, :], Y[:, 0], params["gamma"], Beta2[0]
-            ),
-            _update_Beta(
-                omic2, H2_new[1, :], Y[:, 1], params["gamma"], Beta2[1]
-            ),
-        ]
-    )
-
-    # Put to zero all coefficients linked to components k when k > K_gp
-    if K_gp < K:
-        Beta1_new[K_gp:] = 0
-        Beta2_new[K_gp:] = 0
+    Betas_new = [0 for j in range(len(omics))]
+    for j in range(len(omics)):
+        Betas_new[j] = np.array(
+            [
+                _update_Beta(
+                    omics[j],
+                    Hs_new[j][k, :],
+                    Y[:, k],
+                    params["gamma"],
+                    Betas[j][k]
+                ) for k in range(K)
+            ]
+        )
+        # Put to zero all coefficients linked to components k when k > K_gp
+        if K_gp < K:
+            Betas_new[j][K_gp:] = 0
 
     return (
         W_new,
-        H1_new,
-        H2_new,
-        Beta1_new,
-        Beta2_new,
-        params["eta1"],
-        params["eta2"],
-        H1_loss,
-        H2_loss,
-        H1_grad,
-        H2_grad,
+        Hs_new,
+        Betas_new,
+        current_etas,
+        Hlosses,
+        Hgrads,
    )
 
 
@@ -660,48 +577,44 @@ class NMFProfiler:
     The goal of the method is to find relationships between OMICS
     corresponding to typical profiles of distinct groups of individuals. The
-    objective is to find two decompositions, one for each omic, with a common
+    objective is to find one decomposition for each omic, with a common
     contribution of individuals, in which latent factor matrices are sparse.
 
     The objective function
     :math:`\mathcal{F}
-    (\mathbf{W},\mathbf{H}^{(1)},\mathbf{H}^{(2)},\beta^{(1)},\beta^{(2)})`
+    (\mathbf{W},\mathbf{H}^{(1)},\ldots,\mathbf{H}^{(J)},\beta^{(1)},\ldots,\beta^{(J)})`
     is as follows:
 
     .. math::
 
-        & \dfrac{1}{2}\left( \sum_{j=1}^2\| \mathbf{X}^{(j)} -
+        & \dfrac{1}{2}\left( \sum_{j=1}^J\| \mathbf{X}^{(j)} -
         \mathbf{WH}^{(j)} \|_F^2 \right)
 
-        &+ \dfrac{\gamma}{2}\left( \sum_{j=1}^2\| \mathbf{Y} -
+        &+ \dfrac{\gamma}{2}\left( \sum_{j=1}^J\| \mathbf{Y} -
         \mathbf{X}^{(j)} \mathbf{H}^{(j)\top} \text{Diag}(\beta^{(j)})
         \|_F^2 \right)
 
-        &+ \sum_{j=1}^{2} \lambda\|\mathbf{H}^{(j)}\|_1 +
-        \dfrac{\mu}{2}\|\mathbf{W \|_F^2}
+        &+ \sum_{j=1}^{J} \lambda\|\mathbf{H}^{(j)}\|_1 +
+        \dfrac{\mu}{2}\|\mathbf{W}\|_F^2
 
     Parameters
     ----------
-    :omic1: array-like of shape (n_samples x n_features_omic1).
-        First omics dataset. (n_samples) is the number of samples and
-        (n_features_omic1) the number of features.
-
-    :omic2: array-like of shape (n_samples x n_features_omic2).
-        Second omics dataset. (n_samples) is the number of samples and
-        (n_features_omic2) is the number of features.
-        WARNING: (omic2) must contain the exact same samples in the same order
-        as (omic1).
+    :omics: list of (n_omics) array-like of shape
+        (n_samples x n_features_omicj).
+        Omics datasets. (n_samples) is the number of samples and
+        (n_features_omicj) the number of features of (omicj).
+        WARNING: each (omicj) must contain the exact same samples
+        in the same order.
 
     :y: vector of length (n_samples).
-        Group to which each sample belongs (same order than the rows of omic1
-        and omic2).
+        Group to which each sample belongs (same order as the rows of omicj).
 
-    :params: dict of length 8, optional.
+    :params: dict of length 7, optional.
         Contains, in this order, values for hyperparameters `gamma`, `lambda`,
-        `mu` (from the objective function), for `eta1`, `eta2` (when proximal
+        `mu` (from the objective function), for `eta` (when proximal
         optimization is used), and for `alpha`, `sigma`, `m_back` (for
         `linesearch()`).
-        By default, gamma = 1e-2, lambda = 1e-3, mu = 1e-3, eta1 = eta2 = 1,
+        By default, gamma = 1e-2, lambda = 1e-3, mu = 1e-3, eta = 1,
         alpha = 2, sigma = 1e-9, and m_back = 1.
         In the objective function, `lambda` and `gamma` are additionally
         multiplied by (n_samples).
@@ -757,44 +670,31 @@ class NMFProfiler:
 
     Attributes
     ----------
-    :W: ndarray of shape (n_samples x 2).
+    :W: ndarray of shape (n_samples x K).
         Contributions of individuals in each latent component.
 
-    :W_init: ndarray of shape (n_samples x 2).
+    :W_init: ndarray of shape (n_samples x K).
         Initial version of (W).
 
-    :H1: ndarray of shape (2 x n_features_omic1).
-        Latent components for (omic1).
-
-    :H1_init: ndarray of shape (2 x n_features_omic1).
-        Initial version of (H1).
-
-    :H2: ndarray of shape (2 x n_features_omic2).
-        Latent components for (omic2).
-
-    :H2_init: ndarray of shape (2 x n_features_omic2).
-        Initial version of (H2).
-
-    :Beta1: ndarray of shape (2 x 1).
-        Regression coefficients for the projection of individuals from (omic1)
-        onto (H1).
+    :Hs: list of (n_omics) ndarrays of shape (K x n_features_omicj).
+        Latent components Hj for (omicj).
 
-    :Beta1_init: ndarray of shape (2 x 1).
-        Initial version of (Beta1).
+    :Hs_init: list of (n_omics) ndarrays of shape (K x n_features_omicj).
+        Initial version of (Hj).
 
-    :Beta2: ndarray of shape (K x 1).
-        Regression coefficients for the projection of individuals from (omic2)
-        onto (H2).
+    :Betas: list of (n_omics) ndarrays of shape (K x 1).
+        Regression coefficients for the projection of individuals from (omicj)
+        onto (Hj).
 
-    :Beta2_init: ndarray of shape (K x 1).
-        Initial version of (Beta2).
+    :Betas_init: list of (n_omics) ndarrays of shape (K x 1).
+        Initial version of (Betaj).
 
     :n_iter: int.
         Final number of iterations (up to convergence or maximum number of
         iterations is reached).
 
-    :df_etas: `pd.dataFrame` of shape (n_iter+1, 2).
-        Optimal values for parameters `eta1` and `eta2` at each iteration.
+    :df_etas: `pd.DataFrame` of shape (n_iter+1, n_omics).
+        Optimal values for parameter `eta` at each iteration.
 
-    :df_errors: `pd.dataFrame` of shape (n_iter+1, 9).
+    :df_errors: `pd.DataFrame` of shape (n_iter+1, 3*n_omics + 3).
         All error terms for each iteration and omic j.
 
-    :df_ldaperf: `pd.DataFrame` of shape (n_iter+1, 13).
+    :df_ldaperf: `pd.DataFrame` of shape (n_iter+1, (K+4)*n_omics + 1).
         All metrics linked to LDA at each iteration and omic j.
 
-    :df_grads: `pd.DataFrame` of shape (n_iter+1, 2)
-        Values of H^(1) and H^(2) gradients before being updated, at each
+    :df_grads: `pd.DataFrame` of shape (n_iter+1, n_omics)
+        Values of H^(j) gradients before being updated, at each
         iteration.
 
+    :df_y: `pd.DataFrame` of shape (n_samples, 2)
+        Original levels in :y: as well as their associated
+        integer.
+
     :runningtime: float.
         Running time of the method measured through `process_time()`.
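Since the attribute tables above now scale with the number of omics, here is a minimal sketch of how one might inspect them after fitting. This is illustrative only and assumes a fitted instance `res`; the column names follow the `distort{j}`/`sparsity{j}`/`pred{j}` and `eta_omic{j}` patterns built in `fit()` below:

```python
# Minimal sketch, assuming `res` is a fitted NMFProfiler instance.
# df_errors columns: iter, distort1..J, sparsity1..J, regul, pred1..J, loss.
final_errors = res.df_errors.iloc[-1]
for j in range(1, len(res.omics) + 1):
    print(
        f"omic{j}: distortion={final_errors[f'distort{j}']:.4f}, "
        f"sparsity={final_errors[f'sparsity{j}']:.4f}, "
        f"prediction={final_errors[f'pred{j}']:.4f}"
    )
print(res.df_etas.tail())  # one eta_omic{j} column per omic
```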
@@ -854,7 +758,7 @@ class NMFProfiler:
     >>> y = np.array([1, 1, 1, 0, 0, 0])
     >>> seed = 240805
     >>> from nmfprofiler import NMFProfiler
-    >>> model = NMFProfiler(omic1=X1, omic2=X2, y=y, seed=seed)
+    >>> model = NMFProfiler(omics=[X1, X2], y=y, seed=seed)
     >>> res = model.fit()
     >>> print(res)
     >>> res.heatmap(obj_to_viz="W", height=10, width=10, path="")
@@ -863,15 +767,13 @@
 
     def __init__(
         self,
-        omic1,
-        omic2,
+        omics,
         y,
         params={
             "gamma": 1e-2,
             "lambda": 1e-3,
             "mu": 1e-3,
-            "eta1": 1.00,
-            "eta2": 1.00,
+            "eta": 1.00,
             "alpha": 2,
             "sigma": 1e-9,
             "m_back": 1,
@@ -887,8 +789,7 @@
         verbose=False,
     ):
 
-        self.omic1 = omic1
-        self.omic2 = omic2
+        self.omics = omics
         self.y = y
         self.params = params
         self.init_method = init_method
@@ -904,15 +805,15 @@
     def __str__(self):
         """Briefly describe inputs and outputs of NMFProfiler."""
         print_statement = (
-            "NMFProfiler\n-----------\n\nAnalysis run on dataset 1 containing "
-            f"{self.omic1.shape[1]} features measured on {self.omic1.shape[0]}"
-            f" samples,\ndataset 2 containing {self.omic2.shape[1]} features "
-            f"measured on the same {self.omic2.shape[0]} samples.\nSamples "
-            f"are splitted into {len(np.unique(self.y))} distinct groups.\n\n"
-            f"NMFProfiler (as_sklearn = {self.as_sklearn}) extracted "
-            f"{len(np.unique(self.y))} profiles in {self.runningtime} "
-            "seconds.\n\nFor more information, please refer to help(), "
-            "GitLab page or contact aurelie.mercadie@inrae.fr."
+            f"NMFProfiler\n-----------\n\nAnalysis run on {len(self.omics)} "
+            "datasets containing respectively "
+            f"{*[self.omics[j].shape[1] for j in range(len(self.omics))],} "
+            f"features measured on the same {self.omics[0].shape[0]} samples."
+            f"\nSamples are split into {len(np.unique(self.y))} distinct"
+            f" groups.\n\nNMFProfiler (as_sklearn = {self.as_sklearn}) "
+            f"extracted {len(np.unique(self.y))} profiles "
+            f"in {self.runningtime} seconds.\n\nFor more information, please "
+            "refer to help() or GitLab page."
         )
         return print_statement
 
@@ -935,35 +836,28 @@
         Apply a min-max normalization and divide by the square root of the
         number of features.
         """
-        # First dataset
-        for i in range(self.omic1.shape[1]):
-            self.omic1[:, i] = (
-                self.omic1[:, i] - np.min(self.omic1[:, i])
-            ) / (
-                np.max(self.omic1[:, i]) - np.min(self.omic1[:, i])
-            )
-        self.omic1 = self.omic1 * (1 / np.sqrt(self.omic1.shape[1]))
-
-        # Second dataset
-        for i in range(self.omic2.shape[1]):
-            self.omic2[:, i] = (
-                self.omic2[:, i] - np.min(self.omic2[:, i])
-            ) / (
-                np.max(self.omic2[:, i]) - np.min(self.omic2[:, i])
+        # For each dataset
+        for j in range(len(self.omics)):
+            for i in range(self.omics[j].shape[1]):
+                self.omics[j][:, i] = (
+                    self.omics[j][:, i] - np.min(self.omics[j][:, i])
+                ) / (
+                    np.max(self.omics[j][:, i]) - np.min(self.omics[j][:, i])
+                )
+            self.omics[j] = (
+                self.omics[j] * (1 / np.sqrt(self.omics[j].shape[1]))
             )
-        self.omic2 = self.omic2 * (1 / np.sqrt(self.omic2.shape[1]))
 
     def _initialize_w_h_beta(self):
         """Initialize matrices W, H^j and Beta^j.
 
-        Several ways to intialize W, H1, H2.
-        Beta1 and Beta2 are initialized with 1s vectors.
+        Several ways to initialize W and Hj.
+        Betaj are initialized with vectors of ones.
 
         Note
         ----
         Based on _initialize_nmf of sklearn.decomposition.NMF.
""" - # Extract K, the number of latent components, as equal to # the number of distinct groups in y K = len(np.unique(self.y)) @@ -971,9 +865,11 @@ class NMFProfiler: # dedicated to groups, identically to K K_gp = len(np.unique(self.y)) - # For W, H1 and H2: + # For W and Hj: + # 1.0. Initialize H0s list + H0s = [0 for j in range(len(self.omics))] # 1.1. Concatenate omics data sets - omics = np.concatenate((self.omic1, self.omic2), axis=1) + omics = np.concatenate(self.omics, axis=1) # 1.2. Initialize using sklearn function if self.init_method == "random2": # 1.2a. Initialize W with both omics and Hj @@ -984,29 +880,25 @@ class NMFProfiler: init="random", random_state=self.seed ) - *_, H1_0 = _initialize_nmf( - X=self.omic1, - n_components=K, - init="random", - random_state=self.seed - ) - *_, H2_0 = _initialize_nmf( - X=self.omic2, - n_components=K, - init="random", - random_state=self.seed - ) - del _ + for j in range(len(self.omics)): + *_, H0s[j] = _initialize_nmf( + X=self.omics[j], + n_components=K, + init="random", + random_state=self.seed + ) + del _ elif self.init_method == "random3": # 1.2b. FOR IDENTICAL OMICS DATA SETS - Initialize W - # with both omics, H1 with omic1 and H2 as H1 (random) - W0, H1_0 = _initialize_nmf( - X=self.omic1, + # with both omics, H1 with omic1 and other Hj as H1 (random) + W0, H0s[0] = _initialize_nmf( + X=self.omics[0], n_components=K, init="random", random_state=self.seed ) - H2_0 = H1_0.copy() + for j in range(1, len(self.omics)): + H0s[j] = H0s[0].copy() elif self.init_method == "nndsvd2": # 1.2c. Initialize W with both omics and Hj # with specific omic (nndsvd) @@ -1016,36 +908,27 @@ class NMFProfiler: init="nndsvd", random_state=self.seed ) - *_, H1_0 = _initialize_nmf( - X=self.omic1, - n_components=K, - init="nndsvd", - random_state=self.seed - ) - *_, H2_0 = _initialize_nmf( - X=self.omic2, - n_components=K, - init="nndsvd", - random_state=self.seed - ) - del _ + for j in range(len(self.omics)): + *_, H0s[j] = _initialize_nmf( + X=self.omics[j], + n_components=K, + init="nndsvd", + random_state=self.seed + ) + del _ elif self.init_method == "nndsvd3": # 1.2d. Initialize W with each omic then take the mean and # initialize Hj with specific omic (nndsvd) - W_a, H1_0 = _initialize_nmf( - X=self.omic1, - n_components=K, - init="nndsvd", - random_state=self.seed - ) - W_b, H2_0 = _initialize_nmf( - X=self.omic2, - n_components=K, - init="nndsvd", - random_state=self.seed - ) - W0 = (1 / 2) * (W_a + W_b) - del W_a, W_b + W_js = [0 for j in range(len(self.omics))] + for j in range(len(self.omics)): + W_js[j], H0s[j] = _initialize_nmf( + X=self.omics[j], + n_components=K, + init="nndsvd", + random_state=self.seed + ) + W0 = (1 / len(self.omics)) * (sum(W_js)) + del W_js else: # 1.2e. Initialize both with all omics, using whatever method # available in _initialize_nmf(). See help. @@ -1056,20 +939,28 @@ class NMFProfiler: random_state=self.seed ) # 1.2e. 
Split H matrix
-            H1_0, H2_0 = np.split(
-                ary=H0, indices_or_sections=[self.omic1.shape[1]], axis=1
+            # np.split() expects cumulative cut points, not block widths,
+            # so accumulate the widths of the first (n_omics - 1) datasets
+            indices_or_sections = np.cumsum(
+                [self.omics[j].shape[1] for j in range(len(self.omics) - 1)]
+            )
+            H0s = np.split(
+                ary=H0,
+                indices_or_sections=indices_or_sections,
+                axis=1
             )
             del H0
 
-        # For Beta1 and Beta2:
-        Beta1_0 = np.repeat(1, K)
-        Beta2_0 = np.repeat(1, K)
-        # Put to zero all coefficients linked to component k when k > K_gp
-        if K_gp < K:
-            Beta1_0[K_gp:] = 0
-            Beta2_0[K_gp:] = 0
+        # For Betas:
+        Beta0s = [0 for j in range(len(self.omics))]
+        for j in range(len(self.omics)):
+            Beta0s[j] = np.repeat(1, K)
+            # Put to zero all coefficients linked to component k when k > K_gp
+            if K_gp < K:
+                Beta0s[j][K_gp:] = 0
 
-        return W0, H1_0, H2_0, Beta1_0, Beta2_0
+        return W0, H0s, Beta0s
 
     def fit(self):
         """Run NMFProfiler."""
@@ -1077,14 +968,28 @@
         _check_inputs_NMFProfiler(self)
 
         # Extract K, the number of latent components, as equal
-        # to the number of distinct groups in y
+        # to the number of distinct groups, U, in y
         K = len(np.unique(self.y))
         # For now, initialize K_gp, the number of latent components
         # dedicated to groups, identically to K
        K_gp = len(np.unique(self.y))
 
-        # Automatically convert a vector encoding U groups into
-        # a one-hot encoded matrix
+        # Turn y levels into integers
+        if self.verbose:
+            print("\nConverting y levels into integers...")
+            print("Original levels: ", np.unique(self.y))
+        y_init = deepcopy(self.y)
+        # np.unique() already returns each level once, in sorted order
+        num_levels = {
+            level: idx for idx, level in enumerate(np.unique(self.y))
+        }
+        self.y = np.array([num_levels[level] for level in self.y])
+        if self.verbose:
+            print("New levels: ", np.unique(self.y), "\n")
+
+        # And automatically convert y into a one-hot encoded matrix Y
         encoder = OneHotEncoder(handle_unknown="ignore")
         Y = encoder.fit_transform(self.y.reshape(-1, 1)).toarray()
@@ -1092,12 +997,11 @@
         self._preprocess_data()
 
         # Initialize matrices
-        W0, H1_0, H2_0, Beta1_0, Beta2_0 = self._initialize_w_h_beta()
+        W0, Hs_0, Betas_0 = self._initialize_w_h_beta()
+
         self.W_init = W0
-        self.H1_init = H1_0
-        self.H2_init = H2_0
-        self.Beta1_init = Beta1_0
-        self.Beta2_init = Beta2_0
+        self.Hs_init = Hs_0
+        self.Betas_init = Betas_0
 
         # Update lambda and gamma given sample size
         self._update_params()
@@ -1107,55 +1011,35 @@
 
         # Create matrices and vectors to update using initialization
         # (and keep intact initialized matrices and vectors)
-        W, H1, H2, Beta1, Beta2 = (
-            deepcopy(W0),
-            deepcopy(H1_0),
-            deepcopy(H2_0),
-            deepcopy(Beta1_0),
-            deepcopy(Beta2_0),
-        )
+        W = deepcopy(W0)
+        Hs = [deepcopy(Hs_0[j]) for j in range(len(self.omics))]
+        Betas = [deepcopy(Betas_0[j]) for j in range(len(self.omics))]
 
         # Create lists of error terms to enrich iteratively
+        (error, regul, nb_iters) = ([] for i in range(3))
         (
-            error,
-            margH1,
-            margH2,
-            gradH1,
-            gradH2,
-            distort1,
-            distort2,
-            sparsity1,
-            sparsity2,
-            regul,
-            pred1,
-            pred2,
-            nb_iters,
-        ) = ([] for i in range(13))
+            margHs,
+            gradHs,
+            distorts,
+            sparsitys,
+            preds,
+        ) = ([[] for j in range(len(self.omics))] for _ in range(5))
+
+        # Create lists of LDA performance indicators to enrich iteratively
         (
-            R2adj_1,
-            R2adj_2,
-            BIC_1,
-            BIC_2,
-            AIC_1,
-            AIC_2,
-            F_pval_1,
-            F_pval_2,
-            bet1_1,
-            bet1_2,
-            bet2_1,
-            bet2_2,
-        ) = ([] for i in range(12))
+            R2adjs,
+            BICs,
+            AICs,
+            F_pvals,
+        ) = ([[] for j in range(len(self.omics))] for _ in
range(4)) + bets = [[[] for k in range(K)] for j in range(len(self.omics))] loss_init = _computeF( - self.omic1, - self.omic2, + self.omics, Y, W0, - H1_0, - H2_0, - Beta1_0, - Beta2_0, + Hs_0, + Betas_0, self.params["mu"], self.params["lambda"], self.params["gamma"], @@ -1164,72 +1048,55 @@ class NMFProfiler: ) error.append(loss_init[0]) nb_iters.append(0) - distort1.append(loss_init[1]) - distort2.append(loss_init[2]) - sparsity1.append(loss_init[3]) - sparsity2.append(loss_init[4]) - regul.append(loss_init[5]) - pred1.append(loss_init[6]) - pred2.append(loss_init[7]) + regul.append(loss_init[3]) + for j in range(len(self.omics)): + distorts[j].append(loss_init[1][j]) + sparsitys[j].append(loss_init[2][j]) + preds[j].append(loss_init[4][j]) # Keep track of marginal objective functions in Hj with # initial matrices (necessary for linesearch first execution) - margH1.append( - _computeMargH( - self.omic1, - Y, - W0, - H1_0, - Beta1_0, - self.params["gamma"], - self.params["lambda"], - K_gp, - ) - ) - margH2.append( - _computeMargH( - self.omic2, - Y, - W0, - H2_0, - Beta2_0, - self.params["gamma"], - self.params["lambda"], - K_gp, + for j in range(len(self.omics)): + margHs[j].append( + _computeMargH( + self.omics[j], + Y, + W0, + Hs_0[j], + Betas_0[j], + self.params["gamma"], + self.params["lambda"], + K_gp, + ) ) - ) # LDAs with initial matrices - reg1_init = sm.OLS( - self.y, sm.add_constant(safe_sparse_dot(self.omic1, H1_0.T)) - ).fit() - reg2_init = sm.OLS( - self.y, sm.add_constant(safe_sparse_dot(self.omic2, H2_0.T)) - ).fit() - - bet1_1.append(reg1_init.params[1]) - bet2_1.append(reg1_init.params[2]) - bet1_2.append(reg2_init.params[1]) - bet2_2.append(reg2_init.params[2]) - R2adj_1.append(reg1_init.rsquared_adj) - R2adj_2.append(reg2_init.rsquared_adj) - BIC_1.append(reg1_init.bic) - BIC_2.append(reg2_init.bic) - AIC_1.append(reg1_init.aic) - AIC_2.append(reg2_init.aic) - F_pval_1.append(reg1_init.f_pvalue) - F_pval_2.append(reg2_init.f_pvalue) + regs_init = [0 for j in range(len(self.omics))] + for j in range(len(self.omics)): + regs_init[j] = sm.OLS( + self.y, + sm.add_constant(safe_sparse_dot(self.omics[j], Hs_0[j].T)) + ).fit() + + for j in range(len(self.omics)): + for k in range(K): + bets[j][k].append(regs_init[j].params[k+1]) + R2adjs[j].append(regs_init[j].rsquared_adj) + BICs[j].append(regs_init[j].bic) + AICs[j].append(regs_init[j].aic) + F_pvals[j].append(regs_init[j].f_pvalue) # Show the initial global loss value if self.verbose: print("Error after initialization step: %f" % (error[0])) # To keep track of the choice of etas - eta1, eta2 = ([] for i in range(2)) - eta1.append(self.params["eta1"]) - eta2.append(self.params["eta2"]) + etas_init = [self.params["eta"] for j in range(len(self.omics))] + etas = [[] for j in range(len(self.omics))] + for j in range(len(self.omics)): + etas[j].append(etas_init[j]) - # Begin the optimization,k + # Begin the optimization start_time = process_time() for n_iter in range(1, self.max_iter + 1): @@ -1239,33 +1106,26 @@ class NMFProfiler: if self.solver == "analytical": ( W, - H1, - H2, - Beta1, - Beta2, - self.params["eta1"], - self.params["eta2"], - margH1, - margH2, - gradH1, - gradH2, + Hs, + Betas, + current_etas, + margHs, + gradHs, ) = _analytic_solver( - self.omic1, - self.omic2, + self.omics, Y, W, - H1, - H2, - Beta1, - Beta2, + Hs, + Betas, self.params, + current_etas=[ + etas[j][-1] for j in range(len(self.omics)) + ], as_sklearn=self.as_sklearn, backtrack=self.backtrack, max_iter_back=self.max_iter_back, - H1_loss=margH1, 
- H2_loss=margH2, - H1_grad=gradH1, - H2_grad=gradH2, + Hlosses=margHs, + Hgrads=gradHs, K=K, K_gp=K_gp, verbose=self.verbose, @@ -1273,22 +1133,19 @@ class NMFProfiler: # ... or Autograd solver else: - W, H1, H2, Beta1, Beta2 = _autograd_solver() + W, Hs, Betas = _autograd_solver() # Keep track of optimal etas - eta1.append(self.params["eta1"]) - eta2.append(self.params["eta2"]) + for j in range(len(self.omics)): + etas[j].append(current_etas[j]) # Compute the new loss as well as specific terms loss_ = _computeF( - self.omic1, - self.omic2, + self.omics, Y, W, - H1, - H2, - Beta1, - Beta2, + Hs, + Betas, self.params["mu"], self.params["lambda"], self.params["gamma"], @@ -1297,33 +1154,27 @@ class NMFProfiler: ) error.append(loss_[0]) nb_iters.append(n_iter) - distort1.append(loss_[1]) - distort2.append(loss_[2]) - sparsity1.append(loss_[3]) - sparsity2.append(loss_[4]) - regul.append(loss_[5]) - pred1.append(loss_[6]) - pred2.append(loss_[7]) + regul.append(loss_[3]) + for j in range(len(self.omics)): + distorts[j].append(loss_[1][j]) + sparsitys[j].append(loss_[2][j]) + preds[j].append(loss_[4][j]) # Monitor the LDA part - reg1 = sm.OLS( - self.y, sm.add_constant(safe_sparse_dot(self.omic1, H1.T)) - ).fit() - reg2 = sm.OLS( - self.y, sm.add_constant(safe_sparse_dot(self.omic2, H2.T)) - ).fit() - bet1_1.append(reg1.params[1]) - bet2_1.append(reg1.params[2]) - bet1_2.append(reg2.params[1]) - bet2_2.append(reg2.params[2]) - R2adj_1.append(reg1.rsquared_adj) - R2adj_2.append(reg2.rsquared_adj) - BIC_1.append(reg1.bic) - BIC_2.append(reg2.bic) - AIC_1.append(reg1.aic) - AIC_2.append(reg2.aic) - F_pval_1.append(reg1.f_pvalue) - F_pval_2.append(reg2.f_pvalue) + regs = [0 for j in range(len(self.omics))] + for j in range(len(self.omics)): + regs[j] = sm.OLS( + self.y, + sm.add_constant(safe_sparse_dot(self.omics[j], Hs[j].T)) + ).fit() + + for j in range(len(self.omics)): + for k in range(K): + bets[j][k].append(regs[j].params[k+1]) + R2adjs[j].append(regs[j].rsquared_adj) + BICs[j].append(regs[j].bic) + AICs[j].append(regs[j].aic) + F_pvals[j].append(regs[j].f_pvalue) # Every 10 iterations, if tol is still strictly positive and # verbose == True, compute the loss value @@ -1368,149 +1219,123 @@ class NMFProfiler: # -------------------------------- self.n_iter = n_iter self.W = W - self.H1 = H1 - self.H2 = H2 - self.Beta1 = Beta1 - self.Beta2 = Beta2 + self.Hs = Hs + self.Betas = Betas # Keep track of final Hj gradients and build the Hj gradients matrix # (following up their evolution during optimization) # ------------------------------------------------------------------ - gradH1.append( - multi_dot([W.T, W, H1]) - + ( - self.params["gamma"] - * multi_dot( - [ - np.transpose(Beta1[newaxis]), - Beta1[newaxis], - H1, - self.omic1.T, - self.omic1, - ] - ) - ) - - ( - safe_sparse_dot(W.T, self.omic1) + for j in range(len(self.omics)): + gradHs[j].append( + multi_dot([W.T, W, Hs[j]]) + ( self.params["gamma"] * multi_dot( [ - np.transpose(Beta1[newaxis]), - self.y[newaxis], - self.omic1 + np.transpose(Betas[j][newaxis]), + Betas[j][newaxis], + Hs[j], + self.omics[j].T, + self.omics[j], ] ) ) - ) - ) - gradH2.append( - multi_dot([W.T, W, H2]) - + ( - self.params["gamma"] - * multi_dot( - [ - np.transpose(Beta2[newaxis]), - Beta2[newaxis], - H2, - self.omic2.T, - self.omic2, - ] - ) - ) - - ( - safe_sparse_dot(W.T, self.omic2) - + ( - self.params["gamma"] - * multi_dot( - [np.transpose(Beta2[newaxis]), - self.y[newaxis], self.omic2] + - ( + safe_sparse_dot(W.T, self.omics[j]) + + ( + 
self.params["gamma"] + * multi_dot( + [ + np.transpose(Betas[j][newaxis]), + self.y[newaxis], + self.omics[j] + ] + ) ) ) ) - ) - grads = np.hstack( - ( - np.vstack([norm(i) ** 2 for i in gradH1]), - np.vstack([norm(i) ** 2 for i in gradH2]), - ) + grads = np.array( + [ + [ + norm(i) ** 2 for i in gradHs[j] + ] for j in range(len(self.omics)) + ] + ).T + self.df_grads = pd.DataFrame( + grads, + columns=[f'grad_H{j+1}' for j in range(len(self.omics))] ) - self.df_grads = pd.DataFrame(grads, columns=["grad_H1", "grad_H2"]) # Build the error terms matrix # ---------------------------- error_terms = np.hstack( - ( + [ np.vstack(nb_iters), - np.vstack(distort1), - np.vstack(distort2), - np.vstack(sparsity1), - np.vstack(sparsity2), + np.array(distorts).T, + np.array(sparsitys).T, np.vstack(regul), - np.vstack(pred1), - np.vstack(pred2), + np.array(preds).T, np.vstack(error), - ) + ] ) self.df_errors = pd.DataFrame( error_terms, - columns=[ - "iter", - "distort1", - "distort2", - "sparsity1", - "sparsity2", - "regul", - "pred1", - "pred2", - "loss", - ], + columns=list(itertools.chain.from_iterable([ + ["iter"], + [f"distort{j+1}" for j in range(len(self.omics))], + [f"sparsity{j+1}" for j in range(len(self.omics))], + ["regul"], + [f"pred{j+1}" for j in range(len(self.omics))], + ["loss"], + ])), ) # Build the LDA performance matrix # -------------------------------- - LDA_perf = np.hstack( - ( - np.vstack(nb_iters), - np.vstack(bet1_1), - np.vstack(bet1_2), - np.vstack(R2adj_1), - np.vstack(BIC_1), - np.vstack(AIC_1), - np.vstack(F_pval_1), - np.vstack(bet1_2), - np.vstack(bet2_2), - np.vstack(R2adj_2), - np.vstack(BIC_2), - np.vstack(AIC_2), - np.vstack(F_pval_2), + LDA_perf = np.vstack(nb_iters) + for j in range(len(self.omics)): + bets_j = np.array(bets[j]).T + perf_j = np.array( + [ + R2adjs[j], + BICs[j], + AICs[j], + F_pvals[j], + ] + ).T + LDA_perf = np.hstack( + [LDA_perf, bets_j, perf_j] ) - ) self.df_ldaperf = pd.DataFrame( LDA_perf, - columns=[ - "iter", - "Comp.1 coef (omic1)", - "Comp.2 coef (omic1)", - "R2 Adjusted (omic1)", - "BIC (omic1)", - "AIC (omic1)", - "F-statistic p-value (omic1)", - "Comp.1 coef (omic2)", - "Comp.2 coef (omic2)", - "R2 Adjusted (omic2)", - "BIC (omic2)", - "AIC (omic2)", - "F-statistic p-value (omic2)", - ], + columns=list(itertools.chain.from_iterable([ + ["iter"], + list(itertools.chain.from_iterable([[ + *[f"Comp.{k+1} coef (omic{j+1})" for k in range(K)], + f"R2 Adjusted (omic{j+1})", + f"BIC (omic{j+1})", + f"AIC (omic{j+1})", + f"F-statistic p-value (omic{j+1})" + ] for j in range(len(self.omics))])), + ])), ) # Build the etas matrix # (following up etas evolution during optimization) # ------------------------------------------------- - etas = np.hstack((np.vstack(eta1), np.vstack(eta2))) - self.df_etas = pd.DataFrame(etas, columns=["eta_omic1", "eta_omic2"]) + self.df_etas = pd.DataFrame( + np.array(etas).T, + columns=[f"eta_omic{j+1}" for j in range(len(self.omics))] + ) + + # Build the correspondance matrix between + # original levels and their associated integer in y + # ------------------------------------------------- + self.df_y = pd.DataFrame( + {"Original_y": y_init, + "Recoded_y": self.y} + ) return self @@ -1521,25 +1346,25 @@ class NMFProfiler: Params ------ :new_ind: list. List of arrays containing values of - features from omic1 and omic2 for a new sample. + features from omicj for a new sample. Values ------ :group: list. Predicted group (one of 0 or 1) for the new sample in each omic. - :proj1: array. 
Projection onto H1
-        :proj2: array. Projection onto H2
+        :projs: list of arrays. Projection onto each Hj matrix
         """
-        # Convert to right format
-        new_ind_X1 = new_ind[0][newaxis]
-        new_ind_X2 = new_ind[1][newaxis]
+        # Initialize projection list and compute
+        projs = []
+        for j in range(len(new_ind)):
+            # Convert to right format
+            new_ind_Xj = new_ind[j][newaxis]
 
-        # Compute projections of the new samples onto dictionary matrices
-        proj1 = safe_sparse_dot(new_ind_X1, self.H1.T)
-        proj2 = safe_sparse_dot(new_ind_X2, self.H2.T)
+            # Compute projections of the new samples onto dictionary matrices
+            projs.append(safe_sparse_dot(new_ind_Xj, self.Hs[j].T))
 
         # For each omic, find which component gave the highest score
-        group = [proj1.argmax(), proj2.argmax()]
+        group = [projs[j].argmax() for j in range(len(new_ind))]
 
         # Compute global group value
         group_value = int(np.average(group))
@@ -1549,7 +1374,7 @@
             profile of group {group_value}."
         )
 
-        res = {"group": group, "proj1": proj1, "proj2": proj2}
+        res = {"group": group, "projs": projs}
 
         return res
 
@@ -1566,19 +1391,22 @@
         ------
         Return a barplot of the different error terms.
         """
-        data_barplot = [
-            ["reconstruction omic1", self.df_errors.iloc[-1, 1]],
-            ["reconstruction omic2", self.df_errors.iloc[-1, 2]],
-            ["l2 penalty on W", self.df_errors.iloc[-1, 5]],
-            ["l1 penalty on H1", self.df_errors.iloc[-1, 3]],
-            ["l1 penalty on H2", self.df_errors.iloc[-1, 4]],
-            ["supervised part omic1", self.df_errors.iloc[-1, 6]],
-            ["supervised part omic2", self.df_errors.iloc[-1, 7]],
-        ]
+        J = len(self.omics)
+        data_barplot = np.array([
+            [
+                *[f"reconstruction omic{j+1}" for j in range(J)],
+                *[f"l1 penalty on H{j+1}" for j in range(J)],
+                "l2 penalty on W",
+                *[f"supervised part omic{j+1}" for j in range(J)],
+            ],
+            self.df_errors.iloc[-1, 1:-1].tolist(),
+        ]).T
         df_bar = pd.DataFrame(data_barplot, columns=["part", "value"])
+        df_bar["value"] = df_bar["value"].astype(float)
 
         plt.figure(figsize=(width, height))
-        sns.barplot(data=df_bar, x="part", y="value")
+        g = sns.barplot(data=df_bar, x="part", y="value")
+        g.set(xlabel=None)
         plt.savefig(str(path + "BarplotErrors.png"))
         plt.show()
 
@@ -1596,14 +1424,18 @@
         ------
         Return a lineplot.
         """
+        temppalette = ["blue", "orange", "green", "pink", "yellow", "red"]
+
         if obj_to_check == "etas":
             plt.figure(figsize=(width, height))
-            sns.lineplot(data=self.df_etas, palette=["blue", "orange"])
+            sns.lineplot(data=self.df_etas,
+                         palette=temppalette[0:self.df_etas.shape[1]])
             plt.show()
 
         elif obj_to_check == "gradients":
             plt.figure(figsize=(width, height))
-            sns.lineplot(data=self.df_grads, palette=["blue", "orange"])
+            sns.lineplot(data=self.df_grads,
+                         palette=temppalette[0:self.df_grads.shape[1]])
             plt.show()
 
         else:
@@ -1613,13 +1445,13 @@
                 "from NMFProfiler can be plotted with this method."
             )
 
-    def heatmap(self, obj_to_viz, width, height, path):
+    def heatmap(self, obj_to_viz, width, height, path, omic_number=None):
         """Visualize any matrix of X^j, W, H^j with a heatmap.
 
         Params
         ------
-        :obj_to_viz: str. One of {`'omic1'`, `'omic2'`,
-            `'W'`, `'H1'`, `'H2'`}.
+        :obj_to_viz: str. One of {`'omic'`, `'W'`, `'H'`}.
+        :omic_number: int. Number from 1 to J (max. number of omics).
         :width: int. Width of the figure (in units by default).
         :height: int. Height of the figure (in units by default).
         :path: str. Location to save the figure.
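As a usage note, here is a short sketch of the generalized plotting calls, mirroring the README examples above. It is illustrative only and assumes a fitted instance `res` with at least two omics:

```python
# Usage sketch, assuming `res` is a fitted NMFProfiler instance with J omics.
res.heatmap(obj_to_viz="W", width=10, height=10, path="")  # shared W matrix
for j in range(1, len(res.omics) + 1):
    # omic_number selects the j-th dataset X^(j) or signature matrix H^(j)
    res.heatmap(obj_to_viz="omic", width=15, height=6, path="", omic_number=j)
    res.heatmap(obj_to_viz="H", width=15, height=6, path="", omic_number=j)
```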
@@ -1630,22 +1462,28 @@ class NMFProfiler: """ plt.figure(figsize=(width, height)) - if obj_to_viz == "omic1": - sns.heatmap(pd.DataFrame(self.omic1), cmap="viridis") - elif obj_to_viz == "omic2": - sns.heatmap(pd.DataFrame(self.omic2), cmap="viridis") - elif obj_to_viz == "W": + if obj_to_viz == "W": sns.heatmap(pd.DataFrame(self.W), cmap="viridis") - elif obj_to_viz == "H1": - sns.heatmap(pd.DataFrame(self.H1), cmap="viridis") - elif obj_to_viz == "H2": - sns.heatmap(pd.DataFrame(self.H2), cmap="viridis") + elif obj_to_viz == "omic": + sns.heatmap( + pd.DataFrame(self.omics[(omic_number-1)]), + cmap="viridis" + ) + elif obj_to_viz == "H": + sns.heatmap(pd.DataFrame(self.Hs[(omic_number-1)]), cmap="viridis") else: raise Exception( "Cannot plot this object, please change 'obj_to_viz' input." - " Only 'omic1', 'omic2', 'W', 'H1' and 'H2' outputs " + " Only 'omic', 'W' and 'H' outputs " "from NMFProfiler can be plotted with this method." ) - plt.savefig(str(path + obj_to_viz + "_Heatmap.png")) + if obj_to_viz == "W": + plotname = str(path + obj_to_viz + "_Heatmap.png") + else: + plotname = str( + path + obj_to_viz + str(omic_number) + "_Heatmap.png" + ) + + plt.savefig(plotname) plt.show() diff --git a/nmfprofiler/utils/validate_inputs.py b/nmfprofiler/utils/validate_inputs.py index b3d7ec6bddf01a6d26adb7b3d9a2c2522a94ac56..7e221e8b3f7b7525f77854f473d819ab1d33a383 100644 --- a/nmfprofiler/utils/validate_inputs.py +++ b/nmfprofiler/utils/validate_inputs.py @@ -24,8 +24,7 @@ def _check_inputs_NMFProfiler(X): """ params_names = [ "alpha", - "eta1", - "eta2", + "eta", "gamma", "lambda", "m_back", @@ -43,34 +42,30 @@ def _check_inputs_NMFProfiler(X): 'nndsvdar' ] - if _check_type(X.omic1, np.ndarray) is False: - raise TypeError( - f"Received {type(X.omic1)}, " - "'omic1' should be a numpy.ndarray." - ) - - if _check_type(X.omic2, np.ndarray) is False: - raise TypeError( - f"Received {type(X.omic2)}, " - "'omic2' should be a numpy.ndarray." - ) - if X.omic1.shape[0] != X.omic2.shape[0]: # check dimensions - raise Exception( - "Both datasets should have the exact same samples, but " - f"omic1 has {X.omic1.shape[0]} samples " - f"and omic2 has {X.omic2.shape[0]} samples. " - "Please check datasets." - ) + for j in range(len(X.omics)): + if _check_type(X.omics[j], np.ndarray) is False: + raise TypeError( + f"Received {type(X.omics[j])}, " + f"item number {j+1} in 'omics' (i.e. omic{j+1}) " + "should be a numpy.ndarray." + ) + if X.omics[0].shape[0] != X.omics[j].shape[0]: # check dimensions + raise Exception( + "All datasets should have the exact same samples, but " + f"omic1 (item 1 in 'omics') has {X.omics[0].shape[0]} samples " + f"and omic{j+1} (item {j+1} in 'omics') has " + f"{X.omics[j].shape[0]} samples. Please check datasets." + ) if _check_type(X.y, np.ndarray) is False: raise TypeError( f"Received {type(X.y)}, " "'y' should be a 1D numpy.ndarray." ) - if X.y.shape[0] != X.omic1.shape[0]: # check dimensions + if X.y.shape[0] != X.omics[0].shape[0]: # check dimensions raise Exception( f"Group vector y has {X.y.shape[0]} samples but " - f"omic1 has {X.omic1.shape[0]} samples. " + f"omic1 (item 1 in 'omics') has {X.omics[0].shape[0]} samples. " "Please check datasets." ) if len(np.unique(X.y)) < 2: # check y values @@ -88,9 +83,9 @@ def _check_inputs_NMFProfiler(X): "One or more essential key-value pair(s) missing. " "See documentation." ) - if len(X.params) > 8: + if len(X.params) > 7: raise ValueError( - "Extra key-value pair(s), 'params' should be of length 8. 
" + "Extra key-value pair(s), 'params' should be of length 7. " "See documentation." ) if _check_type(X.params['alpha'], int) is False or X.params['alpha'] < 1: @@ -98,15 +93,10 @@ def _check_inputs_NMFProfiler(X): f"Received {X.params['alpha']} {type(X.params['alpha'])}, " "'alpha' should be a positive integer." ) - if _check_type(X.params['eta1'], float) is False or X.params['eta1'] <= 0: - raise TypeError( - f"Received {X.params['eta1']} {type(X.params['eta1'])}, " - "'eta1' should be a positive float." - ) - if _check_type(X.params['eta2'], float) is False or X.params['eta2'] <= 0: + if _check_type(X.params['eta'], float) is False or X.params['eta'] <= 0: raise TypeError( - f"Received {X.params['eta2']} {type(X.params['eta2'])}, " - "'eta2' should be a positive float." + f"Received {X.params['eta']} {type(X.params['eta'])}, " + "'eta' should be a positive float." ) if _check_type(X.params['gamma'], float) is False or X.params['gamma'] < 0: raise TypeError( diff --git a/pyproject.toml b/pyproject.toml index 13746f635a091831c380c8049ed27e4f2b6cd2fe..7ed7b8286cb47a6fa83b8219e189652a479292bd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,8 +22,8 @@ build-backend = "setuptools.build_meta" [project] name = "NMFProfiler" -version = "0.2.1" -description = "NMFProfiler: an integrative supervised Non-Negative Matrix Factorization to extract typical profiles of groupes of interest combining two different datasets." +version = "0.3.0" +description = "NMFProfiler: an integrative supervised Non-Negative Matrix Factorization to extract typical profiles of groups of interest combining two different datasets." readme = "README.md" authors = [ { name = "Aurélie Mercadié", email = "aurelie.mercadie@inrae.fr" }, diff --git a/tests/test_nmfprofiler.py b/tests/test_nmfprofiler.py index c096c558ef2f1822a559995cd4649d3718c0dabb..ab5bde3bea97345ae6f18b35e59dde305b28d2af 100644 --- a/tests/test_nmfprofiler.py +++ b/tests/test_nmfprofiler.py @@ -662,7 +662,7 @@ seed = 240709 def test_nmfprofiler_mu(): - model = NMFProfiler(omic1=omic1, omic2=omic2, y=y, seed=seed) + model = NMFProfiler(omics=[omic1, omic2], y=y, seed=seed) res = model.fit() assert ( np.round(res.df_errors.iloc[-1, -1], decimals=4) == 0.9604 @@ -671,8 +671,7 @@ def test_nmfprofiler_mu(): def test_nmfprofiler_prox(): model = NMFProfiler( - omic1=omic1, - omic2=omic2, + omics=[omic1, omic2], y=y, seed=seed, as_sklearn=False,