From 07edd874d1a99cfd1ec6020687f3367bf38c1911 Mon Sep 17 00:00:00 2001 From: Nathalie Vialaneix <nathalie.vialaneix@inrae.fr> Date: Fri, 23 Aug 2024 09:57:09 +0200 Subject: [PATCH 01/19] fixed a typo in project description --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 13746f6..ae29d4e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ build-backend = "setuptools.build_meta" [project] name = "NMFProfiler" version = "0.2.1" -description = "NMFProfiler: an integrative supervised Non-Negative Matrix Factorization to extract typical profiles of groupes of interest combining two different datasets." +description = "NMFProfiler: an integrative supervised Non-Negative Matrix Factorization to extract typical profiles of groups of interest combining two different datasets." readme = "README.md" authors = [ { name = "Aurélie Mercadié", email = "aurelie.mercadie@inrae.fr" }, -- GitLab From fb7b09a75b8a8d0836eb787f63351880f721efa4 Mon Sep 17 00:00:00 2001 From: Aurelie Mercadie <aurelie.mercadie@inrae.fr> Date: Wed, 20 Nov 2024 17:38:44 +0100 Subject: [PATCH 02/19] adapt _preprocess_data function to +2 omics (bug) --- nmfprofiler/nmfprofiler.py | 49 +++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/nmfprofiler/nmfprofiler.py b/nmfprofiler/nmfprofiler.py index 5b841ef..01a387b 100644 --- a/nmfprofiler/nmfprofiler.py +++ b/nmfprofiler/nmfprofiler.py @@ -935,23 +935,38 @@ class NMFProfiler: Apply a min-max normalization and divide by the square root of the number of features. """ - # First dataset - for i in range(self.omic1.shape[1]): - self.omic1[:, i] = ( - self.omic1[:, i] - np.min(self.omic1[:, i]) - ) / ( - np.max(self.omic1[:, i]) - np.min(self.omic1[:, i]) - ) - self.omic1 = self.omic1 * (1 / np.sqrt(self.omic1.shape[1])) - - # Second dataset - for i in range(self.omic2.shape[1]): - self.omic2[:, i] = ( - self.omic2[:, i] - np.min(self.omic2[:, i]) - ) / ( - np.max(self.omic2[:, i]) - np.min(self.omic2[:, i]) - ) - self.omic2 = self.omic2 * (1 / np.sqrt(self.omic2.shape[1])) + # # First dataset + # for i in range(self.omic1.shape[1]): + # self.omic1[:, i] = ( + # self.omic1[:, i] - np.min(self.omic1[:, i]) + # ) / ( + # np.max(self.omic1[:, i]) - np.min(self.omic1[:, i]) + # ) + # self.omic1 = self.omic1 * (1 / np.sqrt(self.omic1.shape[1])) + + # # Second dataset + # for i in range(self.omic2.shape[1]): + # self.omic2[:, i] = ( + # self.omic2[:, i] - np.min(self.omic2[:, i]) + # ) / ( + # np.max(self.omic2[:, i]) - np.min(self.omic2[:, i]) + # ) + # self.omic2 = self.omic2 * (1 / np.sqrt(self.omic2.shape[1])) + + omics = [self.omic1, self.omic2] + + # For each dataset + for j in range(len(omics)-1): + for i in range(omics[j].shape[1]): + omics[j][:, i] = ( + omics[j][:, i] - np.min(omics[j][:, i]) + ) / ( + np.max(omics[j][:, i]) - np.min(omics[j][:, i]) + ) + omics[j] = omics[j] * (1 / np.sqrt(omics[j].shape[1])) + + self.omic1 = omics[0] + self.omic2 = omics[1] def _initialize_w_h_beta(self): """Initialize matrices W, H^j and Beta^j. 
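Note on the hunk above: the new loop runs `range(len(omics)-1)`, which stops before the last dataset and leaves omic2 unscaled; the next patch in the series corrects it to `range(len(omics))`. As a minimal standalone sketch of the intended preprocessing (illustrative names, not the package API; assumes NumPy arrays):

    import numpy as np

    def minmax_scale_omics(omics):
        """Min-max scale each column of every omic, then divide by
        sqrt(n_features), as in _preprocess_data (sketch)."""
        scaled = []
        for X in omics:  # iterate over *all* J datasets
            X = np.asarray(X, dtype=float).copy()
            for i in range(X.shape[1]):
                col = X[:, i]
                # a constant column divides by zero here, exactly as
                # in the patched loop above
                X[:, i] = (col - col.min()) / (col.max() - col.min())
            scaled.append(X / np.sqrt(X.shape[1]))
        return scaled
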
-- GitLab From d25c6b347c6187264e4396ae5fd2657e0b2b059d Mon Sep 17 00:00:00 2001 From: Aurelie Mercadie <aurelie.mercadie@inrae.fr> Date: Fri, 22 Nov 2024 16:08:54 +0100 Subject: [PATCH 03/19] generalized _update_W, _computeF, _analytic_solver and _preprocess_data functions --- nmfprofiler/nmfprofiler.py | 263 ++++++++++++++++--------------------- 1 file changed, 112 insertions(+), 151 deletions(-) diff --git a/nmfprofiler/nmfprofiler.py b/nmfprofiler/nmfprofiler.py index 01a387b..1fb8b91 100644 --- a/nmfprofiler/nmfprofiler.py +++ b/nmfprofiler/nmfprofiler.py @@ -61,12 +61,15 @@ def _update_W(W, omic1, omic2, H1, H2, mu): ------- Newly updated W. """ - B = ( - safe_sparse_dot(H1, H1.T) - + safe_sparse_dot(H2, H2.T) - + (mu * np.identity(W.shape[1])) - ) - C = safe_sparse_dot(omic1, H1.T) + safe_sparse_dot(omic2, H2.T) + omics = [omic1, omic2] + Hs = [H1, H2] + + B = mu * np.identity(W.shape[1]) + for j in range(len(Hs)): + B += safe_sparse_dot(Hs[j], Hs[j].T) + C = np.zeros((W.shape[0], W.shape[1])) + for j in range(len(Hs)): + C += safe_sparse_dot(omics[j], Hs[j].T) return multiply(W, divide(C, safe_sparse_dot(W, B))) @@ -243,24 +246,26 @@ def _computeF( Either the value of the objective function (i.e. the global loss) alone, or accompagned by each specific term to obtain it. """ - distort1 = 0.5 * (norm(omic1 - safe_sparse_dot(W, H1)) ** 2) - distort2 = 0.5 * (norm(omic2 - safe_sparse_dot(W, H2)) ** 2) + omics = [omic1, omic2] + Hs = [H1, H2] + Betas = [Beta1, Beta2] + regul = (mu / 2) * np.trace(safe_sparse_dot(W.T, W)) - sparse1 = lambdA * norm(H1, 1) - sparse2 = lambdA * norm(H2, 1) - pred1 = (gamma / 2) * ( - norm(Y - multi_dot([omic1, H1.T[:, 0:K_gp], - np.diag(Beta1[0:K_gp])])) ** 2 - ) - pred2 = (gamma / 2) * ( - norm(Y - multi_dot([omic2, H2.T[:, 0:K_gp], - np.diag(Beta2[0:K_gp])])) ** 2 - ) + distort, sparse, pred = ([] for i in range(3)) + for j in range(len(omics)): + distort.append(0.5 * (norm(omics[j] - safe_sparse_dot(W, Hs[j])) ** 2)) + sparse.append(lambdA * norm(Hs[j], 1)) + pred.append( + (gamma / 2) * ( + norm(Y - multi_dot([omics[j], Hs[j].T[:, 0:K_gp], + np.diag(Betas[j][0:K_gp])])) ** 2 + ) + ) - loss = distort1 + distort2 + regul + sparse1 + sparse2 + pred1 + pred2 + loss = sum(distort) + regul + sum(sparse) + sum(pred) if details: - res = [loss, distort1, distort2, sparse1, sparse2, regul, pred1, pred2] + res = sum([[loss], distort, sparse, [regul], pred], []) return loss if details is False else res @@ -501,131 +506,105 @@ def _analytic_solver( See (Fevotte, 2011) and sklearn.decomposition.NMF source code for choice of parameters. 
""" + omics = [omic1, omic2] + Hs = [H1, H2] + Betas = [Beta1, Beta2] + Hlosses = [H1_loss, H2_loss] + Hgrads = [H1_grad, H2_grad] + etas = [params["eta1"], params["eta2"]] # W update with new values of Hs - W_new = _update_W(W, omic1, omic2, H1, H2, params["mu"]) + W_new = _update_W(W, omics[0], omics[1], Hs[0], Hs[1], params["mu"]) # Hs updates (either MU or Proximal) - H1_new, H1_grad = _update_H( - H1, - W_new, - omic1, - Beta1, - Y, - params["gamma"], - params["eta1"], - params["lambda"], - as_sklearn=as_sklearn, - grad=True, - H_grad=H1_grad, - ) - H2_new, H2_grad = _update_H( - H2, - W_new, - omic2, - Beta2, - Y, - params["gamma"], - params["eta2"], - params["lambda"], - as_sklearn=as_sklearn, - grad=True, - H_grad=H2_grad, - ) + Hs_new = [0] * len(omics) + for j in range(len(omics)): + Hs_new[j], Hgrads[j] = _update_H( + Hs[j], + W_new, + omics[j], + Betas[j], + Y, + params["gamma"], + etas[j], + params["lambda"], + as_sklearn=as_sklearn, + grad=True, + H_grad=Hgrads[j], + ) if backtrack and not as_sklearn: - # LineSearch for H1 - H1_loss.append( - _computeMargH( - omic1, - Y, - W_new, - H1_new, - Beta1, - params["gamma"], - params["lambda"], - K_gp + # LineSearch for each H + for j in range(len(omics)): + Hlosses[j].append( + _computeMargH( + omics[j], + Y, + W_new, + Hs_new[j], + Betas[j], + params["gamma"], + params["lambda"], + K_gp + ) ) - ) - H1_new, H1_loss, params["eta1"] = _linesearch( - H_old=H1, - H=H1_new, - W=W_new, - omic=omic1, - Beta=Beta1, - Y=Y, - gamma=params["gamma"], - lambdA=params["lambda"], - current_eta=params["eta1"], - H_loss=H1_loss, - alpha=params["alpha"], - sigma=params["sigma"], - m_back=params["m_back"], - max_iter_back=max_iter_back, - K_gp=K_gp, - verbose=verbose, - ) - # LineSearch for H2 - H2_loss.append( - _computeMargH( - omic2, - Y, - W_new, - H2_new, - Beta2, - params["gamma"], - params["lambda"], - K_gp + Hs_new[j], Hlosses[j], etas[j] = _linesearch( + H_old=Hs[j], + H=Hs_new[j], + W=W_new, + omic=omics[j], + Beta=Betas[j], + Y=Y, + gamma=params["gamma"], + lambdA=params["lambda"], + current_eta=etas[j], + H_loss=Hlosses[j], + alpha=params["alpha"], + sigma=params["sigma"], + m_back=params["m_back"], + max_iter_back=max_iter_back, + K_gp=K_gp, + verbose=verbose, ) - ) - H2_new, H2_loss, params["eta2"] = _linesearch( - H_old=H2, - H=H2_new, - W=W_new, - omic=omic2, - Beta=Beta2, - Y=Y, - gamma=params["gamma"], - lambdA=params["lambda"], - current_eta=params["eta2"], - H_loss=H2_loss, - alpha=params["alpha"], - sigma=params["sigma"], - m_back=params["m_back"], - max_iter_back=max_iter_back, - K_gp=K_gp, - verbose=verbose, - ) # Betas updates with new values of Hs # Compute each regression coef. 
of each component separately # and gather them in a unique vector - Beta1_new = np.array( - [ - _update_Beta( - omic1, H1_new[0, :], Y[:, 0], params["gamma"], Beta1[0] - ), - _update_Beta( - omic1, H1_new[1, :], Y[:, 1], params["gamma"], Beta1[1] - ), - ] - ) - - Beta2_new = np.array( - [ - _update_Beta( - omic2, H2_new[0, :], Y[:, 0], params["gamma"], Beta2[0] - ), - _update_Beta( - omic2, H2_new[1, :], Y[:, 1], params["gamma"], Beta2[1] - ), - ] - ) + Betas_new = [0] * len(omics) + for j in range(len(omics)): + Betas_new[j] = np.array( + [ + _update_Beta( + omics[j], + Hs_new[j][0, :], + Y[:, 0], + params["gamma"], + Betas[j][0] + ), + _update_Beta( + omics[j], + Hs_new[j][1, :], + Y[:, 1], + params["gamma"], + Betas[j][1] + ), + ] + ) - # Put to zero all coefficients linked to components k when k > K_gp - if K_gp < K: - Beta1_new[K_gp:] = 0 - Beta2_new[K_gp:] = 0 + # Put to zero all coefficients linked to components k when k > K_gp + if K_gp < K: + Betas_new[j][K_gp:] = 0 + + H1_new = Hs_new[0] + H2_new = Hs_new[1] + Beta1_new = Betas_new[0] + Beta2_new = Betas_new[1] + params["eta1"] = etas[0] + params["eta2"] = etas[1] + H1_loss = Hlosses[0] + H2_loss = Hlosses[1] + H1_grad = Hgrads[0] + H2_grad = Hgrads[1] return ( W_new, @@ -935,28 +914,10 @@ class NMFProfiler: Apply a min-max normalization and divide by the square root of the number of features. """ - # # First dataset - # for i in range(self.omic1.shape[1]): - # self.omic1[:, i] = ( - # self.omic1[:, i] - np.min(self.omic1[:, i]) - # ) / ( - # np.max(self.omic1[:, i]) - np.min(self.omic1[:, i]) - # ) - # self.omic1 = self.omic1 * (1 / np.sqrt(self.omic1.shape[1])) - - # # Second dataset - # for i in range(self.omic2.shape[1]): - # self.omic2[:, i] = ( - # self.omic2[:, i] - np.min(self.omic2[:, i]) - # ) / ( - # np.max(self.omic2[:, i]) - np.min(self.omic2[:, i]) - # ) - # self.omic2 = self.omic2 * (1 / np.sqrt(self.omic2.shape[1])) - omics = [self.omic1, self.omic2] # For each dataset - for j in range(len(omics)-1): + for j in range(len(omics)): for i in range(omics[j].shape[1]): omics[j][:, i] = ( omics[j][:, i] - np.min(omics[j][:, i]) -- GitLab From d62779f0d93ff3439fc3ba1260e8ddb171c49378 Mon Sep 17 00:00:00 2001 From: Aurelie Mercadie <aurelie.mercadie@inrae.fr> Date: Fri, 22 Nov 2024 16:49:06 +0100 Subject: [PATCH 04/19] generalized _initialize_w_h_beta function --- nmfprofiler/nmfprofiler.py | 99 ++++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 52 deletions(-) diff --git a/nmfprofiler/nmfprofiler.py b/nmfprofiler/nmfprofiler.py index 1fb8b91..6d57a39 100644 --- a/nmfprofiler/nmfprofiler.py +++ b/nmfprofiler/nmfprofiler.py @@ -939,6 +939,7 @@ class NMFProfiler: ---- Based on _initialize_nmf of sklearn.decomposition.NMF. """ + omicstemp = [self.omic1, self.omic2] # Extract K, the number of latent components, as equal to # the number of distinct groups in y @@ -948,8 +949,10 @@ class NMFProfiler: K_gp = len(np.unique(self.y)) # For W, H1 and H2: + # 1.0. Initialize H0s list + H0s = [0] * len(omicstemp) # 1.1. Concatenate omics data sets - omics = np.concatenate((self.omic1, self.omic2), axis=1) + omics = np.concatenate(omicstemp, axis=1) # 1.2. Initialize using sklearn function if self.init_method == "random2": # 1.2a. 
Initialize W with both omics and Hj @@ -960,29 +963,25 @@ class NMFProfiler: init="random", random_state=self.seed ) - *_, H1_0 = _initialize_nmf( - X=self.omic1, - n_components=K, - init="random", - random_state=self.seed - ) - *_, H2_0 = _initialize_nmf( - X=self.omic2, - n_components=K, - init="random", - random_state=self.seed - ) - del _ + for j in range(len(omicstemp)): + *_, H0s[j] = _initialize_nmf( + X=omicstemp[j], + n_components=K, + init="random", + random_state=self.seed + ) + del _ elif self.init_method == "random3": # 1.2b. FOR IDENTICAL OMICS DATA SETS - Initialize W # with both omics, H1 with omic1 and H2 as H1 (random) - W0, H1_0 = _initialize_nmf( - X=self.omic1, + W0, H0s[0] = _initialize_nmf( + X=omicstemp[0], n_components=K, init="random", random_state=self.seed ) - H2_0 = H1_0.copy() + for j in range(1, len(omicstemp)): + H0s[j] = H0s[0].copy() elif self.init_method == "nndsvd2": # 1.2c. Initialize W with both omics and Hj # with specific omic (nndsvd) @@ -992,36 +991,27 @@ class NMFProfiler: init="nndsvd", random_state=self.seed ) - *_, H1_0 = _initialize_nmf( - X=self.omic1, - n_components=K, - init="nndsvd", - random_state=self.seed - ) - *_, H2_0 = _initialize_nmf( - X=self.omic2, - n_components=K, - init="nndsvd", - random_state=self.seed - ) - del _ + for j in range(len(omicstemp)): + *_, H0s[j] = _initialize_nmf( + X=omicstemp[j], + n_components=K, + init="nndsvd", + random_state=self.seed + ) + del _ elif self.init_method == "nndsvd3": # 1.2d. Initialize W with each omic then take the mean and # initialize Hj with specific omic (nndsvd) - W_a, H1_0 = _initialize_nmf( - X=self.omic1, - n_components=K, - init="nndsvd", - random_state=self.seed - ) - W_b, H2_0 = _initialize_nmf( - X=self.omic2, - n_components=K, - init="nndsvd", - random_state=self.seed - ) - W0 = (1 / 2) * (W_a + W_b) - del W_a, W_b + W_js = [0] * len(omicstemp) + for j in range(len(omicstemp)): + W_js[j], H0s[j] = _initialize_nmf( + X=omicstemp[j], + n_components=K, + init="nndsvd", + random_state=self.seed + ) + W0 = (1 / len(omicstemp)) * (sum(W_js)) + del W_js else: # 1.2e. Initialize both with all omics, using whatever method # available in _initialize_nmf(). See help. @@ -1032,18 +1022,23 @@ class NMFProfiler: random_state=self.seed ) # 1.2e. 
Split H matrix - H1_0, H2_0 = np.split( + H0s = np.split( ary=H0, indices_or_sections=[self.omic1.shape[1]], axis=1 ) del H0 - # For Beta1 and Beta2: - Beta1_0 = np.repeat(1, K) - Beta2_0 = np.repeat(1, K) - # Put to zero all coefficients linked to component k when k > K_gp - if K_gp < K: - Beta1_0[K_gp:] = 0 - Beta2_0[K_gp:] = 0 + # For Betas: + Beta0s = [0] * len(omicstemp) + for j in range(len(omicstemp)): + Beta0s[j] = np.repeat(1, K) + # Put to zero all coefficients linked to component k when k > K_gp + if K_gp < K: + Beta0s[j][K_gp:] = 0 + + H1_0 = H0s[0] + H2_0 = H0s[1] + Beta1_0 = Beta0s[0] + Beta2_0 = Beta0s[1] return W0, H1_0, H2_0, Beta1_0, Beta2_0 -- GitLab From 3d6067193a0cd474c1b51fdd0d69708a87af3bd4 Mon Sep 17 00:00:00 2001 From: Aurelie Mercadie <aurelie.mercadie@inrae.fr> Date: Fri, 22 Nov 2024 17:48:26 +0100 Subject: [PATCH 05/19] generalized predict, evolplot, heatmap functions (unchecked) --- nmfprofiler/nmfprofiler.py | 62 ++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 26 deletions(-) diff --git a/nmfprofiler/nmfprofiler.py b/nmfprofiler/nmfprofiler.py index 6d57a39..9e1dac3 100644 --- a/nmfprofiler/nmfprofiler.py +++ b/nmfprofiler/nmfprofiler.py @@ -1498,19 +1498,21 @@ class NMFProfiler: ------ :group: list. Predicted group (one of 0 or 1) for the new sample in each omic. - :proj1: array. Projection onto H1 - :proj2: array. Projection onto H2 + :projs: list of arrays. Projection onto each Hj matrix """ - # Convert to right format - new_ind_X1 = new_ind[0][newaxis] - new_ind_X2 = new_ind[1][newaxis] + Hs = [self.H1, self.H2] - # Compute projections of the new samples onto dictionary matrices - proj1 = safe_sparse_dot(new_ind_X1, self.H1.T) - proj2 = safe_sparse_dot(new_ind_X2, self.H2.T) + # Initialize projection list and compute + projs = [] + for j in len(new_ind): + # Convert to right format + new_ind_Xj = new_ind[j][newaxis] + + # Compute projections of the new samples onto dictionary matrices + projs.append(safe_sparse_dot(new_ind_Xj, Hs[j].T)) # For each omic, find which component gave the highest score - group = [proj1.argmax(), proj2.argmax()] + group = [projs[j].argmax() for j in len(new_ind)] # Compute global group value group_value = int(np.average(group)) @@ -1520,7 +1522,7 @@ class NMFProfiler: profile of group {group_value}." ) - res = {"group": group, "proj1": proj1, "proj2": proj2} + res = {"group": group, "projs": projs} return res @@ -1567,14 +1569,18 @@ class NMFProfiler: ------ Return a lineplot. """ + temppalette = ["blue", "orange", "green", "pink", "yellow", "red"] + if obj_to_check == "etas": plt.figure(figsize=(width, height)) - sns.lineplot(data=self.df_etas, palette=["blue", "orange"]) + sns.lineplot(data=self.df_etas, + palette=temppalette[0:self.df_etas.shape[1]]) plt.show() elif obj_to_check == "gradients": plt.figure(figsize=(width, height)) - sns.lineplot(data=self.df_grads, palette=["blue", "orange"]) + sns.lineplot(data=self.df_grads, + palette=temppalette[0:self.df_etas.shape[1]]) plt.show() else: @@ -1584,13 +1590,13 @@ class NMFProfiler: "from NMFProfiler can be plotted with this method." ) - def heatmap(self, obj_to_viz, width, height, path): + def heatmap(self, obj_to_viz, width, height, path, omic_number=None): """Visualize any matrix of X^j, W, H^j with a heatmap. Params ------ - :obj_to_viz: str. One of {`'omic1'`, `'omic2'`, - `'W'`, `'H1'`, `'H2'`}. + :obj_to_viz: str. One of {`'omic'`, `'W'`, `'H'`}. + :omic_number: int. Number from 1 to J (max. number of omics). :width: int. 
Width of the figure (in units by default). :height: int. Height of the figure (in units by default). :path: str. Location to save the figure. @@ -1601,22 +1607,26 @@ class NMFProfiler: """ plt.figure(figsize=(width, height)) - if obj_to_viz == "omic1": - sns.heatmap(pd.DataFrame(self.omic1), cmap="viridis") - elif obj_to_viz == "omic2": - sns.heatmap(pd.DataFrame(self.omic2), cmap="viridis") - elif obj_to_viz == "W": + omics = [self.omic1, self.omic2] + Hs = [self.H1, self.H2] + + if obj_to_viz == "W": sns.heatmap(pd.DataFrame(self.W), cmap="viridis") - elif obj_to_viz == "H1": - sns.heatmap(pd.DataFrame(self.H1), cmap="viridis") - elif obj_to_viz == "H2": - sns.heatmap(pd.DataFrame(self.H2), cmap="viridis") + elif obj_to_viz == "omic": + sns.heatmap(pd.DataFrame(omics[(omic_number-1)]), cmap="viridis") + elif obj_to_viz == "H": + sns.heatmap(pd.DataFrame(Hs[(omic_number-1)]), cmap="viridis") else: raise Exception( "Cannot plot this object, please change 'obj_to_viz' input." - " Only 'omic1', 'omic2', 'W', 'H1' and 'H2' outputs " + " Only 'omic', 'W' and 'H' outputs " "from NMFProfiler can be plotted with this method." ) - plt.savefig(str(path + obj_to_viz + "_Heatmap.png")) + if obj_to_viz == "W": + plotname = str(path + obj_to_viz + "_Heatmap.png") + else: + plotname = str(path + obj_to_viz + omic_number + "_Heatmap.png") + + plt.savefig(plotname) plt.show() -- GitLab From 225d4e1dba447c4d5a1fbab655f0a2e18cb877fe Mon Sep 17 00:00:00 2001 From: cebrouard <celinebrouard@MacBook-Pro-de-cebrouard.local> Date: Mon, 25 Nov 2024 17:01:21 +0100 Subject: [PATCH 06/19] started to modify fit --- nmfprofiler/nmfprofiler.py | 187 ++++++++++++++++++++++++------------- 1 file changed, 122 insertions(+), 65 deletions(-) diff --git a/nmfprofiler/nmfprofiler.py b/nmfprofiler/nmfprofiler.py index 9e1dac3..2c8e032 100644 --- a/nmfprofiler/nmfprofiler.py +++ b/nmfprofiler/nmfprofiler.py @@ -1062,13 +1062,18 @@ class NMFProfiler: # Pre-process datasets (with min-max and number of features) self._preprocess_data() + + omics = [self.omic1, self.omic2] + # Initialize matrices W0, H1_0, H2_0, Beta1_0, Beta2_0 = self._initialize_w_h_beta() + Hs_0 = [H1_0, H2_0] + Betas_0 = [Beta1_0, Beta2_0] self.W_init = W0 - self.H1_init = H1_0 - self.H2_init = H2_0 - self.Beta1_init = Beta1_0 - self.Beta2_init = Beta2_0 + self.H1_init = Hs_0[0] + self.H2_init = Hs_0[1] + self.Beta1_init = Betas_0[0] + self.Beta2_init = Betas_0[1] # Update lambda and gamma given sample size self._update_params() @@ -1078,13 +1083,17 @@ class NMFProfiler: # Create matrices and vectors to update using initialization # (and keep intact initialized matrices and vectors) - W, H1, H2, Beta1, Beta2 = ( - deepcopy(W0), - deepcopy(H1_0), - deepcopy(H2_0), - deepcopy(Beta1_0), - deepcopy(Beta2_0), - ) + #W, H1, H2, Beta1, Beta2 = ( + # deepcopy(W0), + # deepcopy(H1_0), + # deepcopy(H2_0), + # deepcopy(Beta1_0), + # deepcopy(Beta2_0), + #) + + W = deepcopy(W0) + Hs = [deepcopy(Hs_0[j]) for j in range(len(omics))] + Betas = [deepcopy(Betas_0[j]) for j in range(len(omics))] # Create lists of error terms to enrich iteratively ( @@ -1102,6 +1111,13 @@ class NMFProfiler: pred2, nb_iters, ) = ([] for i in range(13)) + + margHs = [margH1, margH2] + gradHs = [gradH1, gradH2] + distorts = [distort1, distort2] + sparsitys = [sparsity1, sparsity2] + preds = [pred1, pred2] + # Create lists of LDA performance indicators to enrich iteratively ( R2adj_1, @@ -1117,16 +1133,22 @@ class NMFProfiler: bet2_1, bet2_2, ) = ([] for i in range(12)) + + R2adjs = [R2adj_1, R2adj_2] 
+ BICs = [BIC_1, BIC_2] + AICs = [AIC_1, AIC_2] + F_pvals = [F_pval_1, F_pval_2] + bets = [[bet1_1,bet1_2],[bet2_1,bet2_2]] loss_init = _computeF( - self.omic1, - self.omic2, + omics[0], + omics[1], Y, W0, - H1_0, - H2_0, - Beta1_0, - Beta2_0, + Hs_0[0], + Hs_0[1], + Betas_0[0], + Betas_0[1], self.params["mu"], self.params["lambda"], self.params["gamma"], @@ -1135,61 +1157,95 @@ class NMFProfiler: ) error.append(loss_init[0]) nb_iters.append(0) - distort1.append(loss_init[1]) - distort2.append(loss_init[2]) - sparsity1.append(loss_init[3]) - sparsity2.append(loss_init[4]) + distorts[0].append(loss_init[1]) + distorts[1].append(loss_init[2]) + sparsitys[0].append(loss_init[3]) + sparsitys[1].append(loss_init[4]) regul.append(loss_init[5]) - pred1.append(loss_init[6]) - pred2.append(loss_init[7]) + preds[0].append(loss_init[6]) + preds[1].append(loss_init[7]) + + # change computeF outputs for more than 2 omics + # Keep track of marginal objective functions in Hj with # initial matrices (necessary for linesearch first execution) - margH1.append( - _computeMargH( - self.omic1, - Y, - W0, - H1_0, - Beta1_0, - self.params["gamma"], - self.params["lambda"], - K_gp, - ) - ) - margH2.append( - _computeMargH( - self.omic2, - Y, - W0, - H2_0, - Beta2_0, - self.params["gamma"], - self.params["lambda"], - K_gp, + for j in range(len(omics)): + margHs[j].append( + _computeMargH( + omics[j], + Y, + W0, + Hs_0[j], + Betas_0[j], + self.params["gamma"], + self.params["lambda"], + K_gp, + ) ) - ) + # LDAs with initial matrices - reg1_init = sm.OLS( - self.y, sm.add_constant(safe_sparse_dot(self.omic1, H1_0.T)) - ).fit() - reg2_init = sm.OLS( - self.y, sm.add_constant(safe_sparse_dot(self.omic2, H2_0.T)) - ).fit() - - bet1_1.append(reg1_init.params[1]) - bet2_1.append(reg1_init.params[2]) - bet1_2.append(reg2_init.params[1]) - bet2_2.append(reg2_init.params[2]) - R2adj_1.append(reg1_init.rsquared_adj) - R2adj_2.append(reg2_init.rsquared_adj) - BIC_1.append(reg1_init.bic) - BIC_2.append(reg2_init.bic) - AIC_1.append(reg1_init.aic) - AIC_2.append(reg2_init.aic) - F_pval_1.append(reg1_init.f_pvalue) - F_pval_2.append(reg2_init.f_pvalue) + regs_init = [] + for j in range(len(omics)): + + regs_init[j] = sm.OLS( + self.y, sm.add_constant(safe_sparse_dot(omics[j], Hs_0[j].T)) + ).fit() + + for j in range(len(omics)): + bets[j][0].append(regs_init[j].params[1]) + bets[j][1].append(regs_init[j].params[2]) + R2adjs[j].append(regs_init[j].rsquared_adj) + BICs[j].append(regs_init[j].bic) + AICs[j].append(regs_init[j].aic) + F_pvals[j].append(regs_init[j].f_pvalue) + + #bet1_1.append(reg1_init.params[1]) + #bet2_1.append(reg1_init.params[2]) + #bet1_2.append(reg2_init.params[1]) + #bet2_2.append(reg2_init.params[2]) + #R2adj_1.append(reg1_init.rsquared_adj) + #R2adj_2.append(reg2_init.rsquared_adj) + #BIC_1.append(reg1_init.bic) + #BIC_2.append(reg2_init.bic) + #AIC_1.append(reg1_init.aic) + #AIC_2.append(reg2_init.aic) + #F_pval_1.append(reg1_init.f_pvalue) + #F_pval_2.append(reg2_init.f_pvalue) + + H1 = Hs[0] + H2 = Hs[1] + Beta1 = Betas[0] + Beta2 = Betas[1] + H1_0 = Hs_0[0] + H2_0 = Hs_0[1] + Beta1_0 = Betas_0[0] + Beta2_0 = Betas_0[1] + margH1 = margHs[0] + margH2 = margHs[1] + gradH1 = gradHs[0] + gradH2 = gradHs[1] + distort1 = distorts[0] + distort2 = distorts[1] + sparsity1 = sparsitys[0] + sparsity2 = sparsitys[1] + pred1 = preds[0] + pred2 = preds[1] + R2adj_1 = R2adjs[0] + R2adj_2 = R2adjs[1] + BIC_1 = BICS[0] + BIC_2 = BICS[1] + AIC_1 = AICs[0] + AIC_2 = AICs[1] + F_pval_1 = F_pvals[0] + F_pval_2 = F_pvals[1] 
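        # NB: the rebindings above and below rely on list aliasing:
        # R2adjs[0] and R2adj_1 are the same list object, so appends made
        # through R2adjs (or bets[j][k]) during the iterations stay
        # visible through the old per-omic names.
        # NB: `BICS[0]` / `BICS[1]` above is a capitalization slip for
        # `BICs` and would raise NameError at runtime; this interim
        # unpacking block is removed again when fit() is fully
        # generalized in a later patch of this series.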
+ bet1_1 = bets[0][0] + bet1_2 = bets[0][1] + bet2_1 = bets[1][0] + bet2_2 = bets[1][1] + + # Show the initial global loss value if self.verbose: @@ -1199,6 +1255,7 @@ class NMFProfiler: eta1, eta2 = ([] for i in range(2)) eta1.append(self.params["eta1"]) eta2.append(self.params["eta2"]) + etas = [eta1,eta2] # Begin the optimization,k start_time = process_time() -- GitLab From 840c64322849193830d00807974fb5fb420ab027 Mon Sep 17 00:00:00 2001 From: Aurelie Mercadie <aurelie.mercadie@inrae.fr> Date: Tue, 26 Nov 2024 15:29:19 +0100 Subject: [PATCH 07/19] generalized _check_inputs_NMFProfiler function --- nmfprofiler/utils/validate_inputs.py | 57 +++++++++++++--------------- 1 file changed, 27 insertions(+), 30 deletions(-) diff --git a/nmfprofiler/utils/validate_inputs.py b/nmfprofiler/utils/validate_inputs.py index b3d7ec6..e817e12 100644 --- a/nmfprofiler/utils/validate_inputs.py +++ b/nmfprofiler/utils/validate_inputs.py @@ -43,34 +43,31 @@ def _check_inputs_NMFProfiler(X): 'nndsvdar' ] - if _check_type(X.omic1, np.ndarray) is False: - raise TypeError( - f"Received {type(X.omic1)}, " - "'omic1' should be a numpy.ndarray." - ) - - if _check_type(X.omic2, np.ndarray) is False: - raise TypeError( - f"Received {type(X.omic2)}, " - "'omic2' should be a numpy.ndarray." - ) - if X.omic1.shape[0] != X.omic2.shape[0]: # check dimensions - raise Exception( - "Both datasets should have the exact same samples, but " - f"omic1 has {X.omic1.shape[0]} samples " - f"and omic2 has {X.omic2.shape[0]} samples. " - "Please check datasets." - ) + omics = [X.omic1, X.omic2] + for j in range(len(omics)): + if _check_type(omics[j], np.ndarray) is False: + raise TypeError( + f"Received {type(omics[j])}, " + f"item number {j+1} in 'omics' (i.e. omic{j+1}) " + "should be a numpy.ndarray." + ) + if omics[0].shape[0] != omics[j].shape[0]: # check dimensions + raise Exception( + "All datasets should have the exact same samples, but " + f"omic1 (item 1 in 'omics') has {omics[0].shape[0]} samples " + f"and omic{j+1} (item {j+1} in 'omics') has " + f"{omics[j].shape[0]} samples. Please check datasets." + ) if _check_type(X.y, np.ndarray) is False: raise TypeError( f"Received {type(X.y)}, " "'y' should be a 1D numpy.ndarray." ) - if X.y.shape[0] != X.omic1.shape[0]: # check dimensions + if X.y.shape[0] != omics[0].shape[0]: # check dimensions raise Exception( f"Group vector y has {X.y.shape[0]} samples but " - f"omic1 has {X.omic1.shape[0]} samples. " + f"omic1 (item 1 in 'omics') has {omics[0].shape[0]} samples. " "Please check datasets." ) if len(np.unique(X.y)) < 2: # check y values @@ -98,16 +95,16 @@ def _check_inputs_NMFProfiler(X): f"Received {X.params['alpha']} {type(X.params['alpha'])}, " "'alpha' should be a positive integer." ) - if _check_type(X.params['eta1'], float) is False or X.params['eta1'] <= 0: - raise TypeError( - f"Received {X.params['eta1']} {type(X.params['eta1'])}, " - "'eta1' should be a positive float." - ) - if _check_type(X.params['eta2'], float) is False or X.params['eta2'] <= 0: - raise TypeError( - f"Received {X.params['eta2']} {type(X.params['eta2'])}, " - "'eta2' should be a positive float." - ) + + etas = [X.params['eta1'], X.params['eta2']] + for j in range(len(etas)): + if _check_type(etas[j], float) is False or etas[j] <= 0: + raise TypeError( + f"Received {etas[j]} {type(etas[j])}, " + f"eta {j+1} (item {j+1} in 'params['etas']') " + "should be a positive float." 
+ ) + if _check_type(X.params['gamma'], float) is False or X.params['gamma'] < 0: raise TypeError( f"Received {X.params['gamma']} {type(X.params['gamma'])}, " -- GitLab From 152a3a28d11bf622d6fc8d82f9026ddc5704fd91 Mon Sep 17 00:00:00 2001 From: Aurelie Mercadie <aurelie.mercadie@inrae.fr> Date: Wed, 27 Nov 2024 10:50:24 +0100 Subject: [PATCH 08/19] generalized fit function (1st step) --- nmfprofiler/nmfprofiler.py | 370 ++++++++++++++----------------------- 1 file changed, 142 insertions(+), 228 deletions(-) diff --git a/nmfprofiler/nmfprofiler.py b/nmfprofiler/nmfprofiler.py index 2c8e032..336b9aa 100644 --- a/nmfprofiler/nmfprofiler.py +++ b/nmfprofiler/nmfprofiler.py @@ -1062,18 +1062,19 @@ class NMFProfiler: # Pre-process datasets (with min-max and number of features) self._preprocess_data() - - omics = [self.omic1, self.omic2] - + omics = [self.omic1, self.omic2] # *** + # Initialize matrices - W0, H1_0, H2_0, Beta1_0, Beta2_0 = self._initialize_w_h_beta() - Hs_0 = [H1_0, H2_0] - Betas_0 = [Beta1_0, Beta2_0] + W0, H1_0, H2_0, Beta1_0, Beta2_0 = self._initialize_w_h_beta() # *** + + Hs_0 = [H1_0, H2_0] # *** + Betas_0 = [Beta1_0, Beta2_0] # *** + self.W_init = W0 - self.H1_init = Hs_0[0] - self.H2_init = Hs_0[1] - self.Beta1_init = Betas_0[0] - self.Beta2_init = Betas_0[1] + self.H1_init = Hs_0[0] # *** + self.H2_init = Hs_0[1] # *** + self.Beta1_init = Betas_0[0] # *** + self.Beta2_init = Betas_0[1] # *** # Update lambda and gamma given sample size self._update_params() @@ -1083,20 +1084,12 @@ class NMFProfiler: # Create matrices and vectors to update using initialization # (and keep intact initialized matrices and vectors) - #W, H1, H2, Beta1, Beta2 = ( - # deepcopy(W0), - # deepcopy(H1_0), - # deepcopy(H2_0), - # deepcopy(Beta1_0), - # deepcopy(Beta2_0), - #) - W = deepcopy(W0) Hs = [deepcopy(Hs_0[j]) for j in range(len(omics))] Betas = [deepcopy(Betas_0[j]) for j in range(len(omics))] # Create lists of error terms to enrich iteratively - ( + ( # *** error, margH1, margH2, @@ -1111,15 +1104,15 @@ class NMFProfiler: pred2, nb_iters, ) = ([] for i in range(13)) - - margHs = [margH1, margH2] - gradHs = [gradH1, gradH2] - distorts = [distort1, distort2] - sparsitys = [sparsity1, sparsity2] - preds = [pred1, pred2] - + + margHs = [margH1, margH2] # *** + gradHs = [gradH1, gradH2] # *** + distorts = [distort1, distort2] # *** + sparsitys = [sparsity1, sparsity2] # *** + preds = [pred1, pred2] # *** + # Create lists of LDA performance indicators to enrich iteratively - ( + ( # *** R2adj_1, R2adj_2, BIC_1, @@ -1133,14 +1126,14 @@ class NMFProfiler: bet2_1, bet2_2, ) = ([] for i in range(12)) - - R2adjs = [R2adj_1, R2adj_2] - BICs = [BIC_1, BIC_2] - AICs = [AIC_1, AIC_2] - F_pvals = [F_pval_1, F_pval_2] - bets = [[bet1_1,bet1_2],[bet2_1,bet2_2]] - - loss_init = _computeF( + + R2adjs = [R2adj_1, R2adj_2] # *** + BICs = [BIC_1, BIC_2] # *** + AICs = [AIC_1, AIC_2] # *** + F_pvals = [F_pval_1, F_pval_2] # *** + bets = [[bet1_1, bet1_2], [bet2_1, bet2_2]] # *** + + loss_init = _computeF( # *** omics[0], omics[1], Y, @@ -1157,16 +1150,13 @@ class NMFProfiler: ) error.append(loss_init[0]) nb_iters.append(0) - distorts[0].append(loss_init[1]) - distorts[1].append(loss_init[2]) - sparsitys[0].append(loss_init[3]) - sparsitys[1].append(loss_init[4]) - regul.append(loss_init[5]) - preds[0].append(loss_init[6]) - preds[1].append(loss_init[7]) - - # change computeF outputs for more than 2 omics - + distorts[0].append(loss_init[1]) # *** + distorts[1].append(loss_init[2]) # *** + 
sparsitys[0].append(loss_init[3]) # *** + sparsitys[1].append(loss_init[4]) # *** + regul.append(loss_init[5]) # *** + preds[0].append(loss_init[6]) # *** + preds[1].append(loss_init[7]) # *** # Keep track of marginal objective functions in Hj with # initial matrices (necessary for linesearch first execution) @@ -1184,80 +1174,32 @@ class NMFProfiler: ) ) - # LDAs with initial matrices - regs_init = [] - for j in range(len(omics)): - + regs_init = [0] * len(omics) # *** + for j in range(len(omics)): # *** regs_init[j] = sm.OLS( self.y, sm.add_constant(safe_sparse_dot(omics[j], Hs_0[j].T)) ).fit() - for j in range(len(omics)): + for j in range(len(omics)): # *** bets[j][0].append(regs_init[j].params[1]) bets[j][1].append(regs_init[j].params[2]) R2adjs[j].append(regs_init[j].rsquared_adj) BICs[j].append(regs_init[j].bic) AICs[j].append(regs_init[j].aic) F_pvals[j].append(regs_init[j].f_pvalue) - - #bet1_1.append(reg1_init.params[1]) - #bet2_1.append(reg1_init.params[2]) - #bet1_2.append(reg2_init.params[1]) - #bet2_2.append(reg2_init.params[2]) - #R2adj_1.append(reg1_init.rsquared_adj) - #R2adj_2.append(reg2_init.rsquared_adj) - #BIC_1.append(reg1_init.bic) - #BIC_2.append(reg2_init.bic) - #AIC_1.append(reg1_init.aic) - #AIC_2.append(reg2_init.aic) - #F_pval_1.append(reg1_init.f_pvalue) - #F_pval_2.append(reg2_init.f_pvalue) - - H1 = Hs[0] - H2 = Hs[1] - Beta1 = Betas[0] - Beta2 = Betas[1] - H1_0 = Hs_0[0] - H2_0 = Hs_0[1] - Beta1_0 = Betas_0[0] - Beta2_0 = Betas_0[1] - margH1 = margHs[0] - margH2 = margHs[1] - gradH1 = gradHs[0] - gradH2 = gradHs[1] - distort1 = distorts[0] - distort2 = distorts[1] - sparsity1 = sparsitys[0] - sparsity2 = sparsitys[1] - pred1 = preds[0] - pred2 = preds[1] - R2adj_1 = R2adjs[0] - R2adj_2 = R2adjs[1] - BIC_1 = BICS[0] - BIC_2 = BICS[1] - AIC_1 = AICs[0] - AIC_2 = AICs[1] - F_pval_1 = F_pvals[0] - F_pval_2 = F_pvals[1] - bet1_1 = bets[0][0] - bet1_2 = bets[0][1] - bet2_1 = bets[1][0] - bet2_2 = bets[1][1] - - # Show the initial global loss value if self.verbose: print("Error after initialization step: %f" % (error[0])) # To keep track of the choice of etas - eta1, eta2 = ([] for i in range(2)) - eta1.append(self.params["eta1"]) - eta2.append(self.params["eta2"]) - etas = [eta1,eta2] + etas_init = [self.params["eta1"], self.params["eta2"]] # *** + etas = [[0] for _ in range(len(omics))] # *** + for j in range(len(etas)): + etas[j].append(etas_init[j]) # CHECK HERE ! - # Begin the optimization,k + # Begin the optimization start_time = process_time() for n_iter in range(1, self.max_iter + 1): @@ -1265,35 +1207,35 @@ class NMFProfiler: # Analytical solver... if self.solver == "analytical": - ( + ( # *** W, - H1, - H2, - Beta1, - Beta2, + Hs[0], + Hs[1], + Betas[0], + Betas[1], self.params["eta1"], self.params["eta2"], - margH1, - margH2, - gradH1, - gradH2, + margHs[0], + margHs[1], + gradHs[0], + gradHs[1], ) = _analytic_solver( - self.omic1, - self.omic2, + omics[0], + omics[1], Y, W, - H1, - H2, - Beta1, - Beta2, + Hs[0], + Hs[1], + Betas[0], + Betas[1], self.params, as_sklearn=self.as_sklearn, backtrack=self.backtrack, max_iter_back=self.max_iter_back, - H1_loss=margH1, - H2_loss=margH2, - H1_grad=gradH1, - H2_grad=gradH2, + H1_loss=margHs[0], + H2_loss=margHs[1], + H1_grad=gradHs[0], + H2_grad=gradHs[1], K=K, K_gp=K_gp, verbose=self.verbose, @@ -1301,22 +1243,23 @@ class NMFProfiler: # ... 
or Autograd solver else: - W, H1, H2, Beta1, Beta2 = _autograd_solver() + W, Hs[0], Hs[1], Betas[0], Betas[1] = _autograd_solver() # *** # Keep track of optimal etas - eta1.append(self.params["eta1"]) - eta2.append(self.params["eta2"]) + etas_current = [self.params["eta1"], self.params["eta1"]] # *** + for j in range(len(etas_current)): # *** + etas[j].append(etas_current[j]) # *** # Compute the new loss as well as specific terms - loss_ = _computeF( - self.omic1, - self.omic2, + loss_ = _computeF( # *** + omics[0], + omics[1], Y, W, - H1, - H2, - Beta1, - Beta2, + Hs[0], + Hs[1], + Betas[0], + Betas[1], self.params["mu"], self.params["lambda"], self.params["gamma"], @@ -1325,33 +1268,28 @@ class NMFProfiler: ) error.append(loss_[0]) nb_iters.append(n_iter) - distort1.append(loss_[1]) - distort2.append(loss_[2]) - sparsity1.append(loss_[3]) - sparsity2.append(loss_[4]) - regul.append(loss_[5]) - pred1.append(loss_[6]) - pred2.append(loss_[7]) + distorts[0].append(loss_[1]) # *** + distorts[1].append(loss_[2]) # *** + sparsitys[0].append(loss_[3]) # *** + sparsitys[1].append(loss_[4]) # *** + regul.append(loss_[5]) # *** + preds[0].append(loss_[6]) # *** + preds[1].append(loss_[7]) # *** # Monitor the LDA part - reg1 = sm.OLS( - self.y, sm.add_constant(safe_sparse_dot(self.omic1, H1.T)) - ).fit() - reg2 = sm.OLS( - self.y, sm.add_constant(safe_sparse_dot(self.omic2, H2.T)) - ).fit() - bet1_1.append(reg1.params[1]) - bet2_1.append(reg1.params[2]) - bet1_2.append(reg2.params[1]) - bet2_2.append(reg2.params[2]) - R2adj_1.append(reg1.rsquared_adj) - R2adj_2.append(reg2.rsquared_adj) - BIC_1.append(reg1.bic) - BIC_2.append(reg2.bic) - AIC_1.append(reg1.aic) - AIC_2.append(reg2.aic) - F_pval_1.append(reg1.f_pvalue) - F_pval_2.append(reg2.f_pvalue) + regs = [0] * len(omics) # *** + for j in range(len(omics)): # *** + regs[j] = sm.OLS( + self.y, sm.add_constant(safe_sparse_dot(omics[j], Hs[j].T)) + ).fit() + + for j in range(len(omics)): + bets[j][0].append(regs[j].params[1]) # *** + bets[j][1].append(regs[j].params[2]) # *** + R2adjs[j].append(regs[j].rsquared_adj) + BICs[j].append(regs[j].bic) + AICs[j].append(regs[j].aic) + F_pvals[j].append(regs[j].f_pvalue) # Every 10 iterations, if tol is still strictly positive and # verbose == True, compute the loss value @@ -1396,92 +1334,68 @@ class NMFProfiler: # -------------------------------- self.n_iter = n_iter self.W = W - self.H1 = H1 - self.H2 = H2 - self.Beta1 = Beta1 - self.Beta2 = Beta2 + self.H1 = Hs[0] # *** + self.H2 = Hs[1] # *** + self.Beta1 = Betas[0] # *** + self.Beta2 = Betas[1] # *** # Keep track of final Hj gradients and build the Hj gradients matrix # (following up their evolution during optimization) # ------------------------------------------------------------------ - gradH1.append( - multi_dot([W.T, W, H1]) - + ( - self.params["gamma"] - * multi_dot( - [ - np.transpose(Beta1[newaxis]), - Beta1[newaxis], - H1, - self.omic1.T, - self.omic1, - ] - ) - ) - - ( - safe_sparse_dot(W.T, self.omic1) + for j in range(len(omics)): # *** + gradHs[j].append( + multi_dot([W.T, W, Hs[j]]) + ( self.params["gamma"] * multi_dot( [ - np.transpose(Beta1[newaxis]), - self.y[newaxis], - self.omic1 + np.transpose(Betas[j][newaxis]), + Betas[j][newaxis], + Hs[j], + omics[j].T, + omics[j], ] ) ) - ) - ) - gradH2.append( - multi_dot([W.T, W, H2]) - + ( - self.params["gamma"] - * multi_dot( - [ - np.transpose(Beta2[newaxis]), - Beta2[newaxis], - H2, - self.omic2.T, - self.omic2, - ] - ) - ) - - ( - safe_sparse_dot(W.T, self.omic2) - + ( - 
self.params["gamma"] - * multi_dot( - [np.transpose(Beta2[newaxis]), - self.y[newaxis], self.omic2] + - ( + safe_sparse_dot(W.T, omics[j]) + + ( + self.params["gamma"] + * multi_dot( + [ + np.transpose(Betas[j][newaxis]), + self.y[newaxis], + omics[j] + ] + ) ) ) ) - ) grads = np.hstack( ( - np.vstack([norm(i) ** 2 for i in gradH1]), - np.vstack([norm(i) ** 2 for i in gradH2]), + np.vstack([norm(i) ** 2 for i in gradHs[0]]), # *** + np.vstack([norm(i) ** 2 for i in gradHs[1]]), # *** ) ) self.df_grads = pd.DataFrame(grads, columns=["grad_H1", "grad_H2"]) # Build the error terms matrix # ---------------------------- - error_terms = np.hstack( + error_terms = np.hstack( # *** ( np.vstack(nb_iters), - np.vstack(distort1), - np.vstack(distort2), - np.vstack(sparsity1), - np.vstack(sparsity2), + np.vstack(distorts[0]), + np.vstack(distorts[1]), + np.vstack(sparsitys[0]), + np.vstack(sparsitys[1]), np.vstack(regul), - np.vstack(pred1), - np.vstack(pred2), + np.vstack(preds[0]), + np.vstack(preds[1]), np.vstack(error), ) ) - self.df_errors = pd.DataFrame( + self.df_errors = pd.DataFrame( # *** error_terms, columns=[ "iter", @@ -1498,24 +1412,24 @@ class NMFProfiler: # Build the LDA performance matrix # -------------------------------- - LDA_perf = np.hstack( + LDA_perf = np.hstack( # *** ( np.vstack(nb_iters), - np.vstack(bet1_1), - np.vstack(bet1_2), - np.vstack(R2adj_1), - np.vstack(BIC_1), - np.vstack(AIC_1), - np.vstack(F_pval_1), - np.vstack(bet1_2), - np.vstack(bet2_2), - np.vstack(R2adj_2), - np.vstack(BIC_2), - np.vstack(AIC_2), - np.vstack(F_pval_2), + np.vstack(bets[0][0]), + np.vstack(bets[0][1]), + np.vstack(R2adjs[0]), + np.vstack(BICs[0]), + np.vstack(AICs[0]), + np.vstack(F_pvals[0]), + np.vstack(bets[1][0]), + np.vstack(bets[1][1]), + np.vstack(R2adjs[1]), + np.vstack(BICs[1]), + np.vstack(AICs[1]), + np.vstack(F_pvals[1]), ) ) - self.df_ldaperf = pd.DataFrame( + self.df_ldaperf = pd.DataFrame( # *** LDA_perf, columns=[ "iter", @@ -1537,8 +1451,8 @@ class NMFProfiler: # Build the etas matrix # (following up etas evolution during optimization) # ------------------------------------------------- - etas = np.hstack((np.vstack(eta1), np.vstack(eta2))) - self.df_etas = pd.DataFrame(etas, columns=["eta_omic1", "eta_omic2"]) + Etas = np.hstack((np.vstack(etas[0]), np.vstack(etas[1]))) # *** + self.df_etas = pd.DataFrame(Etas, columns=["eta_omic1", "eta_omic2"]) return self -- GitLab From 4f9f80b7eb0287762d34a5f1dd321f1e91043719 Mon Sep 17 00:00:00 2001 From: Aurelie Mercadie <aurelie.mercadie@inrae.fr> Date: Wed, 27 Nov 2024 12:30:01 +0100 Subject: [PATCH 09/19] generalized fit function (2nd step) --- nmfprofiler/nmfprofiler.py | 151 ++++++++++++++++--------------------- 1 file changed, 67 insertions(+), 84 deletions(-) diff --git a/nmfprofiler/nmfprofiler.py b/nmfprofiler/nmfprofiler.py index 336b9aa..baf3c4a 100644 --- a/nmfprofiler/nmfprofiler.py +++ b/nmfprofiler/nmfprofiler.py @@ -11,6 +11,8 @@ groups from copy import deepcopy +import itertools + import warnings import matplotlib.pyplot as plt import seaborn as sns @@ -1089,49 +1091,23 @@ class NMFProfiler: Betas = [deepcopy(Betas_0[j]) for j in range(len(omics))] # Create lists of error terms to enrich iteratively - ( # *** - error, - margH1, - margH2, - gradH1, - gradH2, - distort1, - distort2, - sparsity1, - sparsity2, - regul, - pred1, - pred2, - nb_iters, - ) = ([] for i in range(13)) - - margHs = [margH1, margH2] # *** - gradHs = [gradH1, gradH2] # *** - distorts = [distort1, distort2] # *** - sparsitys = [sparsity1, 
sparsity2] # *** - preds = [pred1, pred2] # *** + (error, regul, nb_iters) = ([] for i in range(3)) + ( + margHs, + gradHs, + distorts, + sparsitys, + preds, + ) = ([[] for j in range(len(omics))] for _ in range(5)) # Create lists of LDA performance indicators to enrich iteratively - ( # *** - R2adj_1, - R2adj_2, - BIC_1, - BIC_2, - AIC_1, - AIC_2, - F_pval_1, - F_pval_2, - bet1_1, - bet1_2, - bet2_1, - bet2_2, - ) = ([] for i in range(12)) - - R2adjs = [R2adj_1, R2adj_2] # *** - BICs = [BIC_1, BIC_2] # *** - AICs = [AIC_1, AIC_2] # *** - F_pvals = [F_pval_1, F_pval_2] # *** - bets = [[bet1_1, bet1_2], [bet2_1, bet2_2]] # *** + ( + R2adjs, + BICs, + AICs, + F_pvals, + ) = ([[] for j in range(len(omics))] for _ in range(4)) + bets = [[[] for k in range(K)] for j in range(len(omics))] loss_init = _computeF( # *** omics[0], @@ -1175,15 +1151,15 @@ class NMFProfiler: ) # LDAs with initial matrices - regs_init = [0] * len(omics) # *** + regs_init = [0 for j in range(len(omics))] # *** for j in range(len(omics)): # *** regs_init[j] = sm.OLS( self.y, sm.add_constant(safe_sparse_dot(omics[j], Hs_0[j].T)) ).fit() for j in range(len(omics)): # *** - bets[j][0].append(regs_init[j].params[1]) - bets[j][1].append(regs_init[j].params[2]) + for k in range(K): + bets[j][k].append(regs_init[j].params[k+1]) R2adjs[j].append(regs_init[j].rsquared_adj) BICs[j].append(regs_init[j].bic) AICs[j].append(regs_init[j].aic) @@ -1195,9 +1171,9 @@ class NMFProfiler: # To keep track of the choice of etas etas_init = [self.params["eta1"], self.params["eta2"]] # *** - etas = [[0] for _ in range(len(omics))] # *** - for j in range(len(etas)): - etas[j].append(etas_init[j]) # CHECK HERE ! + etas = [[] for j in range(len(omics))] # *** + for j in range(len(omics)): + etas[j].append(etas_init[j]) # Begin the optimization start_time = process_time() @@ -1246,7 +1222,7 @@ class NMFProfiler: W, Hs[0], Hs[1], Betas[0], Betas[1] = _autograd_solver() # *** # Keep track of optimal etas - etas_current = [self.params["eta1"], self.params["eta1"]] # *** + etas_current = [self.params["eta1"], self.params["eta2"]] # *** for j in range(len(etas_current)): # *** etas[j].append(etas_current[j]) # *** @@ -1277,15 +1253,15 @@ class NMFProfiler: preds[1].append(loss_[7]) # *** # Monitor the LDA part - regs = [0] * len(omics) # *** + regs = [0 for j in range(len(omics))] # *** for j in range(len(omics)): # *** regs[j] = sm.OLS( self.y, sm.add_constant(safe_sparse_dot(omics[j], Hs[j].T)) ).fit() for j in range(len(omics)): - bets[j][0].append(regs[j].params[1]) # *** - bets[j][1].append(regs[j].params[2]) # *** + for k in range(K): + bets[j][k].append(regs[j].params[k+1]) R2adjs[j].append(regs[j].rsquared_adj) BICs[j].append(regs[j].bic) AICs[j].append(regs[j].aic) @@ -1373,12 +1349,16 @@ class NMFProfiler: ) grads = np.hstack( - ( - np.vstack([norm(i) ** 2 for i in gradHs[0]]), # *** - np.vstack([norm(i) ** 2 for i in gradHs[1]]), # *** - ) + ([ + np.vstack( + [norm(i) ** 2 for i in gradHs[j]] + ) for j in range(len(omics)) # *** + ]) + ) + self.df_grads = pd.DataFrame( + grads, + columns=[f'grad_H{j+1}' for j in range(len(omics))] # *** ) - self.df_grads = pd.DataFrame(grads, columns=["grad_H1", "grad_H2"]) # Build the error terms matrix # ---------------------------- @@ -1395,19 +1375,26 @@ class NMFProfiler: np.vstack(error), ) ) - self.df_errors = pd.DataFrame( # *** + # error_terms = np.hstack( # *** + # ( + # np.vstack(nb_iters), + # (np.vstack(distorts[j]) for j in range(len(omics))), + # (np.vstack(sparsitys[j]) for j in 
range(len(omics))), + # np.vstack(regul), + # (np.vstack(preds[j]) for j in range(len(omics))), + # np.vstack(error), + # ) + # ) + self.df_errors = pd.DataFrame( error_terms, - columns=[ - "iter", - "distort1", - "distort2", - "sparsity1", - "sparsity2", - "regul", - "pred1", - "pred2", - "loss", - ], + columns=list(itertools.chain.from_iterable([ + ["iter"], + [f"distort{j+1}" for j in range(len(omics))], # *** + [f"sparsity{j+1}" for j in range(len(omics))], # *** + ["regul"], + [f"pred{j+1}" for j in range(len(omics))], # *** + ["loss"], + ])), ) # Build the LDA performance matrix @@ -1429,23 +1416,19 @@ class NMFProfiler: np.vstack(F_pvals[1]), ) ) - self.df_ldaperf = pd.DataFrame( # *** + self.df_ldaperf = pd.DataFrame( LDA_perf, - columns=[ - "iter", - "Comp.1 coef (omic1)", - "Comp.2 coef (omic1)", - "R2 Adjusted (omic1)", - "BIC (omic1)", - "AIC (omic1)", - "F-statistic p-value (omic1)", - "Comp.1 coef (omic2)", - "Comp.2 coef (omic2)", - "R2 Adjusted (omic2)", - "BIC (omic2)", - "AIC (omic2)", - "F-statistic p-value (omic2)", - ], + columns=list(itertools.chain.from_iterable([ + ["iter"], + list(itertools.chain.from_iterable([[ + f"Comp.1 coef (omic{j+1})", + f"Comp.2 coef (omic{j+1})", + f"R2 Adjusted (omic{j+1})", + f"BIC (omic{j+1})", + f"AIC (omic{j+1})", + f"F-statistic p-value (omic{j+1})" + ] for j in range(len(omics))])), # *** + ])), ) # Build the etas matrix -- GitLab From 83832831a8e29b34afc608fbd49fbf61e780e4e1 Mon Sep 17 00:00:00 2001 From: Aurelie Mercadie <aurelie.mercadie@inrae.fr> Date: Wed, 27 Nov 2024 14:32:03 +0100 Subject: [PATCH 10/19] generalized fit function (last step) --- nmfprofiler/nmfprofiler.py | 94 +++++++++++++++----------------------- 1 file changed, 38 insertions(+), 56 deletions(-) diff --git a/nmfprofiler/nmfprofiler.py b/nmfprofiler/nmfprofiler.py index baf3c4a..5f3f6d6 100644 --- a/nmfprofiler/nmfprofiler.py +++ b/nmfprofiler/nmfprofiler.py @@ -1050,14 +1050,13 @@ class NMFProfiler: _check_inputs_NMFProfiler(self) # Extract K, the number of latent components, as equal - # to the number of distinct groups in y + # to the number of distinct groups, U, in y K = len(np.unique(self.y)) # For now, initialize K_gp, the number of latent components # dedicated to groups, identically to K K_gp = len(np.unique(self.y)) - # Automatically convert a vector encoding U groups into - # a one-hot encoded matrix + # Automatically convert y into a one-hot encoded matrix Y encoder = OneHotEncoder(handle_unknown="ignore") Y = encoder.fit_transform(self.y.reshape(-1, 1)).toarray() @@ -1087,8 +1086,8 @@ class NMFProfiler: # Create matrices and vectors to update using initialization # (and keep intact initialized matrices and vectors) W = deepcopy(W0) - Hs = [deepcopy(Hs_0[j]) for j in range(len(omics))] - Betas = [deepcopy(Betas_0[j]) for j in range(len(omics))] + Hs = [deepcopy(Hs_0[j]) for j in range(len(omics))] # *** + Betas = [deepcopy(Betas_0[j]) for j in range(len(omics))] # *** # Create lists of error terms to enrich iteratively (error, regul, nb_iters) = ([] for i in range(3)) @@ -1098,7 +1097,7 @@ class NMFProfiler: distorts, sparsitys, preds, - ) = ([[] for j in range(len(omics))] for _ in range(5)) + ) = ([[] for j in range(len(omics))] for _ in range(5)) # *** # Create lists of LDA performance indicators to enrich iteratively ( @@ -1106,8 +1105,8 @@ class NMFProfiler: BICs, AICs, F_pvals, - ) = ([[] for j in range(len(omics))] for _ in range(4)) - bets = [[[] for k in range(K)] for j in range(len(omics))] + ) = ([[] for j in range(len(omics))] for _ in 
range(4)) # *** + bets = [[[] for k in range(K)] for j in range(len(omics))] # *** loss_init = _computeF( # *** omics[0], @@ -1136,7 +1135,7 @@ class NMFProfiler: # Keep track of marginal objective functions in Hj with # initial matrices (necessary for linesearch first execution) - for j in range(len(omics)): + for j in range(len(omics)): # *** margHs[j].append( _computeMargH( omics[j], @@ -1172,7 +1171,7 @@ class NMFProfiler: # To keep track of the choice of etas etas_init = [self.params["eta1"], self.params["eta2"]] # *** etas = [[] for j in range(len(omics))] # *** - for j in range(len(omics)): + for j in range(len(omics)): # *** etas[j].append(etas_init[j]) # Begin the optimization @@ -1348,13 +1347,11 @@ class NMFProfiler: ) ) - grads = np.hstack( - ([ - np.vstack( - [norm(i) ** 2 for i in gradHs[j]] - ) for j in range(len(omics)) # *** - ]) - ) + grads = np.array( # *** + [ + [norm(i) ** 2 for i in gradHs[j]] for j in range(len(omics)) + ] + ).T self.df_grads = pd.DataFrame( grads, columns=[f'grad_H{j+1}' for j in range(len(omics))] # *** @@ -1362,29 +1359,16 @@ class NMFProfiler: # Build the error terms matrix # ---------------------------- - error_terms = np.hstack( # *** - ( + error_terms = np.hstack( + [ np.vstack(nb_iters), - np.vstack(distorts[0]), - np.vstack(distorts[1]), - np.vstack(sparsitys[0]), - np.vstack(sparsitys[1]), + np.array(distorts).T, + np.array(sparsitys).T, np.vstack(regul), - np.vstack(preds[0]), - np.vstack(preds[1]), + np.array(preds).T, np.vstack(error), - ) + ] ) - # error_terms = np.hstack( # *** - # ( - # np.vstack(nb_iters), - # (np.vstack(distorts[j]) for j in range(len(omics))), - # (np.vstack(sparsitys[j]) for j in range(len(omics))), - # np.vstack(regul), - # (np.vstack(preds[j]) for j in range(len(omics))), - # np.vstack(error), - # ) - # ) self.df_errors = pd.DataFrame( error_terms, columns=list(itertools.chain.from_iterable([ @@ -1399,30 +1383,26 @@ class NMFProfiler: # Build the LDA performance matrix # -------------------------------- - LDA_perf = np.hstack( # *** - ( - np.vstack(nb_iters), - np.vstack(bets[0][0]), - np.vstack(bets[0][1]), - np.vstack(R2adjs[0]), - np.vstack(BICs[0]), - np.vstack(AICs[0]), - np.vstack(F_pvals[0]), - np.vstack(bets[1][0]), - np.vstack(bets[1][1]), - np.vstack(R2adjs[1]), - np.vstack(BICs[1]), - np.vstack(AICs[1]), - np.vstack(F_pvals[1]), + LDA_perf = np.vstack(nb_iters) + for j in range(len(omics)): + bets_j = np.array(bets[j]).T + perf_j = np.array( + [ + R2adjs[j], + BICs[j], + AICs[j], + F_pvals[j], + ] + ).T + LDA_perf = np.hstack( + [LDA_perf, bets_j, perf_j] ) - ) self.df_ldaperf = pd.DataFrame( LDA_perf, columns=list(itertools.chain.from_iterable([ ["iter"], list(itertools.chain.from_iterable([[ - f"Comp.1 coef (omic{j+1})", - f"Comp.2 coef (omic{j+1})", + *[f"Comp.{k+1} coef (omic{j+1})" for k in range(K)], f"R2 Adjusted (omic{j+1})", f"BIC (omic{j+1})", f"AIC (omic{j+1})", @@ -1434,8 +1414,10 @@ class NMFProfiler: # Build the etas matrix # (following up etas evolution during optimization) # ------------------------------------------------- - Etas = np.hstack((np.vstack(etas[0]), np.vstack(etas[1]))) # *** - self.df_etas = pd.DataFrame(Etas, columns=["eta_omic1", "eta_omic2"]) + self.df_etas = pd.DataFrame( + np.array(etas).T, + columns=[f"eta_omic{j+1}" for j in range(len(omics))] # *** + ) return self -- GitLab From a071ad6c6cb9144cd0d4863d4dfeb5934e3699f7 Mon Sep 17 00:00:00 2001 From: Aurelie Mercadie <aurelie.mercadie@inrae.fr> Date: Wed, 27 Nov 2024 15:43:43 +0100 Subject: [PATCH 11/19] extended 
generalization + checked plot methods --- nmfprofiler/nmfprofiler.py | 185 +++++++++++++++++++------------------ 1 file changed, 95 insertions(+), 90 deletions(-) diff --git a/nmfprofiler/nmfprofiler.py b/nmfprofiler/nmfprofiler.py index 5f3f6d6..2edeff6 100644 --- a/nmfprofiler/nmfprofiler.py +++ b/nmfprofiler/nmfprofiler.py @@ -39,7 +39,7 @@ def _prox_l1_pos(v, lambd): return np.maximum(v - lambd, 0) -def _update_W(W, omic1, omic2, H1, H2, mu): +def _update_W(W, omic1, omic2, H1, H2, mu): # *** """Update the matrix containing contributions of individuals to latent components. @@ -63,8 +63,8 @@ def _update_W(W, omic1, omic2, H1, H2, mu): ------- Newly updated W. """ - omics = [omic1, omic2] - Hs = [H1, H2] + omics = [omic1, omic2] # *** + Hs = [H1, H2] # *** B = mu * np.identity(W.shape[1]) for j in range(len(Hs)): @@ -195,7 +195,7 @@ def _update_Beta(omic, h_k, y_k, gamma, Beta): return Beta -def _computeF( +def _computeF( # *** omic1, omic2, Y, @@ -248,9 +248,9 @@ def _computeF( Either the value of the objective function (i.e. the global loss) alone, or accompagned by each specific term to obtain it. """ - omics = [omic1, omic2] - Hs = [H1, H2] - Betas = [Beta1, Beta2] + omics = [omic1, omic2] # *** + Hs = [H1, H2] # *** + Betas = [Beta1, Beta2] # *** regul = (mu / 2) * np.trace(safe_sparse_dot(W.T, W)) distort, sparse, pred = ([] for i in range(3)) @@ -267,7 +267,7 @@ def _computeF( loss = sum(distort) + regul + sum(sparse) + sum(pred) if details: - res = sum([[loss], distort, sparse, [regul], pred], []) + res = sum([[loss], distort, sparse, [regul], pred], []) # *** return loss if details is False else res @@ -418,7 +418,7 @@ def _linesearch( return H, H_loss, current_eta -def _analytic_solver( +def _analytic_solver( # *** omic1, omic2, Y, @@ -508,15 +508,15 @@ def _analytic_solver( See (Fevotte, 2011) and sklearn.decomposition.NMF source code for choice of parameters. """ - omics = [omic1, omic2] - Hs = [H1, H2] - Betas = [Beta1, Beta2] - Hlosses = [H1_loss, H2_loss] - Hgrads = [H1_grad, H2_grad] - etas = [params["eta1"], params["eta2"]] + omics = [omic1, omic2] # *** + Hs = [H1, H2] # *** + Betas = [Beta1, Beta2] # *** + Hlosses = [H1_loss, H2_loss] # *** + Hgrads = [H1_grad, H2_grad] # *** + etas = [params["eta1"], params["eta2"]] # *** # W update with new values of Hs - W_new = _update_W(W, omics[0], omics[1], Hs[0], Hs[1], params["mu"]) + W_new = _update_W(W, omics[0], omics[1], Hs[0], Hs[1], params["mu"]) # *** # Hs updates (either MU or Proximal) Hs_new = [0] * len(omics) @@ -572,43 +572,35 @@ def _analytic_solver( # Betas updates with new values of Hs # Compute each regression coef. 
of each component separately # and gather them in a unique vector - Betas_new = [0] * len(omics) + Betas_new = [0 for j in range(len(omics))] for j in range(len(omics)): Betas_new[j] = np.array( [ _update_Beta( omics[j], - Hs_new[j][0, :], - Y[:, 0], + Hs_new[j][k, :], + Y[:, k], params["gamma"], - Betas[j][0] - ), - _update_Beta( - omics[j], - Hs_new[j][1, :], - Y[:, 1], - params["gamma"], - Betas[j][1] - ), + Betas[j][k] + ) for k in range(K) ] ) - # Put to zero all coefficients linked to components k when k > K_gp if K_gp < K: Betas_new[j][K_gp:] = 0 - H1_new = Hs_new[0] - H2_new = Hs_new[1] - Beta1_new = Betas_new[0] - Beta2_new = Betas_new[1] - params["eta1"] = etas[0] - params["eta2"] = etas[1] - H1_loss = Hlosses[0] - H2_loss = Hlosses[1] - H1_grad = Hgrads[0] - H2_grad = Hgrads[1] - - return ( + H1_new = Hs_new[0] # *** + H2_new = Hs_new[1] # *** + Beta1_new = Betas_new[0] # *** + Beta2_new = Betas_new[1] # *** + params["eta1"] = etas[0] # *** + params["eta2"] = etas[1] # *** + H1_loss = Hlosses[0] # *** + H2_loss = Hlosses[1] # *** + H1_grad = Hgrads[0] # *** + H2_grad = Hgrads[1] # *** + + return ( # *** W_new, H1_new, H2_new, @@ -842,12 +834,12 @@ class NMFProfiler: >>> model.barplot_error(height=6, width=15, path="") """ - def __init__( + def __init__( # *** self, omic1, omic2, y, - params={ + params={ # *** "gamma": 1e-2, "lambda": 1e-3, "mu": 1e-3, @@ -868,8 +860,8 @@ class NMFProfiler: verbose=False, ): - self.omic1 = omic1 - self.omic2 = omic2 + self.omic1 = omic1 # *** + self.omic2 = omic2 # *** self.y = y self.params = params self.init_method = init_method @@ -882,7 +874,7 @@ class NMFProfiler: self.seed = seed self.verbose = verbose - def __str__(self): + def __str__(self): # *** """Briefly describe inputs and outputs of NMFProfiler.""" print_statement = ( "NMFProfiler\n-----------\n\nAnalysis run on dataset 1 containing " @@ -916,20 +908,20 @@ class NMFProfiler: Apply a min-max normalization and divide by the square root of the number of features. """ - omics = [self.omic1, self.omic2] + omics = [self.omic1, self.omic2] # *** # For each dataset - for j in range(len(omics)): - for i in range(omics[j].shape[1]): - omics[j][:, i] = ( - omics[j][:, i] - np.min(omics[j][:, i]) + for j in range(len(omics)): # *** + for i in range(omics[j].shape[1]): # *** + omics[j][:, i] = ( # *** + omics[j][:, i] - np.min(omics[j][:, i]) # *** ) / ( - np.max(omics[j][:, i]) - np.min(omics[j][:, i]) + np.max(omics[j][:, i]) - np.min(omics[j][:, i]) # *** ) - omics[j] = omics[j] * (1 / np.sqrt(omics[j].shape[1])) + omics[j] = omics[j] * (1 / np.sqrt(omics[j].shape[1])) # *** - self.omic1 = omics[0] - self.omic2 = omics[1] + self.omic1 = omics[0] # *** + self.omic2 = omics[1] # *** def _initialize_w_h_beta(self): """Initialize matrices W, H^j and Beta^j. @@ -941,7 +933,7 @@ class NMFProfiler: ---- Based on _initialize_nmf of sklearn.decomposition.NMF. """ - omicstemp = [self.omic1, self.omic2] + omicstemp = [self.omic1, self.omic2] # *** # Extract K, the number of latent components, as equal to # the number of distinct groups in y @@ -952,9 +944,9 @@ class NMFProfiler: # For W, H1 and H2: # 1.0. Initialize H0s list - H0s = [0] * len(omicstemp) + H0s = [0 for j in range(len(omicstemp))] # *** # 1.1. Concatenate omics data sets - omics = np.concatenate(omicstemp, axis=1) + omics = np.concatenate(omicstemp, axis=1) # *** # 1.2. Initialize using sklearn function if self.init_method == "random2": # 1.2a. 
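For orientation, the generalized `_computeF` above evaluates, for J omics, a sum of per-omic distortion, sparsity and prediction terms plus a single ridge term on W. A minimal standalone sketch of that objective, assuming plain NumPy inputs (function and variable names here are illustrative, not the package API; `norm(Hj, 1)` simply mirrors the package's own call):

    import numpy as np
    from numpy.linalg import multi_dot, norm

    def global_loss(omics, Y, W, Hs, Betas, mu, lambd, gamma, K_gp):
        """J-omics NMFProfiler objective (illustrative sketch)."""
        regul = (mu / 2) * np.trace(W.T @ W)
        distort, sparse, pred = [], [], []
        for Xj, Hj, Bj in zip(omics, Hs, Betas):
            distort.append(0.5 * norm(Xj - W @ Hj) ** 2)
            sparse.append(lambd * norm(Hj, 1))  # mirrors the package's norm(H, 1)
            pred.append(
                (gamma / 2)
                * norm(Y - multi_dot([Xj, Hj.T[:, :K_gp], np.diag(Bj[:K_gp])])) ** 2
            )
        return sum(distort) + regul + sum(sparse) + sum(pred)

With this shape, supporting a third omic reduces to passing three-element `omics`, `Hs` and `Betas` lists; the loop body is unchanged, which is the aim of the whole patch series.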
Initialize W with both omics and Hj @@ -965,9 +957,9 @@ class NMFProfiler: init="random", random_state=self.seed ) - for j in range(len(omicstemp)): + for j in range(len(omicstemp)): # *** *_, H0s[j] = _initialize_nmf( - X=omicstemp[j], + X=omicstemp[j], # *** n_components=K, init="random", random_state=self.seed @@ -977,12 +969,12 @@ class NMFProfiler: # 1.2b. FOR IDENTICAL OMICS DATA SETS - Initialize W # with both omics, H1 with omic1 and H2 as H1 (random) W0, H0s[0] = _initialize_nmf( - X=omicstemp[0], + X=omicstemp[0], # *** n_components=K, init="random", random_state=self.seed ) - for j in range(1, len(omicstemp)): + for j in range(1, len(omicstemp)): # *** H0s[j] = H0s[0].copy() elif self.init_method == "nndsvd2": # 1.2c. Initialize W with both omics and Hj @@ -993,9 +985,9 @@ class NMFProfiler: init="nndsvd", random_state=self.seed ) - for j in range(len(omicstemp)): + for j in range(len(omicstemp)): # *** *_, H0s[j] = _initialize_nmf( - X=omicstemp[j], + X=omicstemp[j], # *** n_components=K, init="nndsvd", random_state=self.seed @@ -1004,15 +996,15 @@ class NMFProfiler: elif self.init_method == "nndsvd3": # 1.2d. Initialize W with each omic then take the mean and # initialize Hj with specific omic (nndsvd) - W_js = [0] * len(omicstemp) - for j in range(len(omicstemp)): + W_js = [0 for j in range(len(omics))] # *** + for j in range(len(omicstemp)): # *** W_js[j], H0s[j] = _initialize_nmf( - X=omicstemp[j], + X=omicstemp[j], # *** n_components=K, init="nndsvd", random_state=self.seed ) - W0 = (1 / len(omicstemp)) * (sum(W_js)) + W0 = (1 / len(omicstemp)) * (sum(W_js)) # *** del W_js else: # 1.2e. Initialize both with all omics, using whatever method @@ -1024,25 +1016,33 @@ class NMFProfiler: random_state=self.seed ) # 1.2e. Split H matrix + if len(omicstemp) == 2: + indices_or_sections = [omicstemp[0].shape[1]] # *** + else: + indices_or_sections = [ + omicstemp[j].shape[1] for j in range(len(omicstemp)) # *** + ] H0s = np.split( - ary=H0, indices_or_sections=[self.omic1.shape[1]], axis=1 + ary=H0, + indices_or_sections=indices_or_sections, + axis=1 ) del H0 # For Betas: - Beta0s = [0] * len(omicstemp) - for j in range(len(omicstemp)): + Beta0s = [0 for j in range(len(omics))] # *** + for j in range(len(omicstemp)): # *** Beta0s[j] = np.repeat(1, K) # Put to zero all coefficients linked to component k when k > K_gp if K_gp < K: Beta0s[j][K_gp:] = 0 - H1_0 = H0s[0] - H2_0 = H0s[1] - Beta1_0 = Beta0s[0] - Beta2_0 = Beta0s[1] + H1_0 = H0s[0] # *** + H2_0 = H0s[1] # *** + Beta1_0 = Beta0s[0] # *** + Beta2_0 = Beta0s[1] # *** - return W0, H1_0, H2_0, Beta1_0, Beta2_0 + return W0, H1_0, H2_0, Beta1_0, Beta2_0 # *** def fit(self): """Run NMFProfiler.""" @@ -1436,7 +1436,7 @@ class NMFProfiler: in each omic. :projs: list of arrays. Projection onto each Hj matrix """ - Hs = [self.H1, self.H2] + Hs = [self.H1, self.H2] # *** # Initialize projection list and compute projs = [] @@ -1475,19 +1475,22 @@ class NMFProfiler: ------ Return a barplot of the different error terms. 
""" - data_barplot = [ - ["reconstruction omic1", self.df_errors.iloc[-1, 1]], - ["reconstruction omic2", self.df_errors.iloc[-1, 2]], - ["l2 penalty on W", self.df_errors.iloc[-1, 5]], - ["l1 penalty on H1", self.df_errors.iloc[-1, 3]], - ["l1 penalty on H2", self.df_errors.iloc[-1, 4]], - ["supervised part omic1", self.df_errors.iloc[-1, 6]], - ["supervised part omic2", self.df_errors.iloc[-1, 7]], - ] + J = len([self.omic1, self.omic2]) # *** + data_barplot = np.array([ + [ + *[f"reconstruction omic{j+1}" for j in range(J)], + *[f"l1 penalty on H{j+1}" for j in range(J)], + "l2 penalty on W", + *[f"supervised part omic{j+1}" for j in range(J)], + ], + self.df_errors.iloc[-1, 1:-1].tolist(), + ]).T df_bar = pd.DataFrame(data_barplot, columns=["part", "value"]) + df_bar["value"] = df_bar["value"].astype(float) plt.figure(figsize=(width, height)) - sns.barplot(data=df_bar, x="part", y="value") + g = sns.barplot(data=df_bar, x="part", y="value") + g.set(xlabel=None) plt.savefig(str(path + "BarplotErrors.png")) plt.show() @@ -1543,8 +1546,8 @@ class NMFProfiler: """ plt.figure(figsize=(width, height)) - omics = [self.omic1, self.omic2] - Hs = [self.H1, self.H2] + omics = [self.omic1, self.omic2] # *** + Hs = [self.H1, self.H2] # *** if obj_to_viz == "W": sns.heatmap(pd.DataFrame(self.W), cmap="viridis") @@ -1562,7 +1565,9 @@ class NMFProfiler: if obj_to_viz == "W": plotname = str(path + obj_to_viz + "_Heatmap.png") else: - plotname = str(path + obj_to_viz + omic_number + "_Heatmap.png") + plotname = str( + path + obj_to_viz + str(omic_number) + "_Heatmap.png" + ) plt.savefig(plotname) plt.show() -- GitLab From acd854ab86f7d15ef3c8d51759d4a45a1c788a00 Mon Sep 17 00:00:00 2001 From: Aurelie Mercadie <aurelie.mercadie@inrae.fr> Date: Wed, 27 Nov 2024 16:37:54 +0100 Subject: [PATCH 12/19] adapted attributes --- nmfprofiler/nmfprofiler.py | 342 +++++++++++---------------- nmfprofiler/utils/validate_inputs.py | 29 ++- 2 files changed, 150 insertions(+), 221 deletions(-) diff --git a/nmfprofiler/nmfprofiler.py b/nmfprofiler/nmfprofiler.py index 2edeff6..0380bdd 100644 --- a/nmfprofiler/nmfprofiler.py +++ b/nmfprofiler/nmfprofiler.py @@ -39,7 +39,7 @@ def _prox_l1_pos(v, lambd): return np.maximum(v - lambd, 0) -def _update_W(W, omic1, omic2, H1, H2, mu): # *** +def _update_W(W, omics, Hs, mu): """Update the matrix containing contributions of individuals to latent components. @@ -63,9 +63,6 @@ def _update_W(W, omic1, omic2, H1, H2, mu): # *** ------- Newly updated W. """ - omics = [omic1, omic2] # *** - Hs = [H1, H2] # *** - B = mu * np.identity(W.shape[1]) for j in range(len(Hs)): B += safe_sparse_dot(Hs[j], Hs[j].T) @@ -195,15 +192,12 @@ def _update_Beta(omic, h_k, y_k, gamma, Beta): return Beta -def _computeF( # *** - omic1, - omic2, +def _computeF( + omics, Y, W, - H1, - H2, - Beta1, - Beta2, + Hs, + Betas, mu, lambdA, gamma, @@ -248,10 +242,6 @@ def _computeF( # *** Either the value of the objective function (i.e. the global loss) alone, or accompagned by each specific term to obtain it. 
""" - omics = [omic1, omic2] # *** - Hs = [H1, H2] # *** - Betas = [Beta1, Beta2] # *** - regul = (mu / 2) * np.trace(safe_sparse_dot(W.T, W)) distort, sparse, pred = ([] for i in range(3)) for j in range(len(omics)): @@ -267,7 +257,7 @@ def _computeF( # *** loss = sum(distort) + regul + sum(sparse) + sum(pred) if details: - res = sum([[loss], distort, sparse, [regul], pred], []) # *** + res = [loss, distort, sparse, regul, pred] return loss if details is False else res @@ -418,23 +408,18 @@ def _linesearch( return H, H_loss, current_eta -def _analytic_solver( # *** - omic1, - omic2, +def _analytic_solver( + omics, Y, W, - H1, - H2, - Beta1, - Beta2, + Hs, + Betas, params, as_sklearn, backtrack, max_iter_back, - H1_loss, - H2_loss, - H1_grad, - H2_grad, + Hlosses, + Hgrads, K, K_gp, verbose, @@ -508,18 +493,13 @@ def _analytic_solver( # *** See (Fevotte, 2011) and sklearn.decomposition.NMF source code for choice of parameters. """ - omics = [omic1, omic2] # *** - Hs = [H1, H2] # *** - Betas = [Beta1, Beta2] # *** - Hlosses = [H1_loss, H2_loss] # *** - Hgrads = [H1_grad, H2_grad] # *** etas = [params["eta1"], params["eta2"]] # *** # W update with new values of Hs - W_new = _update_W(W, omics[0], omics[1], Hs[0], Hs[1], params["mu"]) # *** + W_new = _update_W(W, omics, Hs, params["mu"]) # Hs updates (either MU or Proximal) - Hs_new = [0] * len(omics) + Hs_new = [0 for j in range(len(omics))] for j in range(len(omics)): Hs_new[j], Hgrads[j] = _update_H( Hs[j], @@ -589,29 +569,17 @@ def _analytic_solver( # *** if K_gp < K: Betas_new[j][K_gp:] = 0 - H1_new = Hs_new[0] # *** - H2_new = Hs_new[1] # *** - Beta1_new = Betas_new[0] # *** - Beta2_new = Betas_new[1] # *** params["eta1"] = etas[0] # *** params["eta2"] = etas[1] # *** - H1_loss = Hlosses[0] # *** - H2_loss = Hlosses[1] # *** - H1_grad = Hgrads[0] # *** - H2_grad = Hgrads[1] # *** - return ( # *** + return ( W_new, - H1_new, - H2_new, - Beta1_new, - Beta2_new, - params["eta1"], - params["eta2"], - H1_loss, - H2_loss, - H1_grad, - H2_grad, + Hs_new, + Betas_new, + params["eta1"], # *** + params["eta2"], # *** + Hlosses, + Hgrads, ) @@ -834,10 +802,9 @@ class NMFProfiler: >>> model.barplot_error(height=6, width=15, path="") """ - def __init__( # *** + def __init__( self, - omic1, - omic2, + omics, y, params={ # *** "gamma": 1e-2, @@ -860,8 +827,7 @@ class NMFProfiler: verbose=False, ): - self.omic1 = omic1 # *** - self.omic2 = omic2 # *** + self.omics = omics self.y = y self.params = params self.init_method = init_method @@ -874,18 +840,18 @@ class NMFProfiler: self.seed = seed self.verbose = verbose - def __str__(self): # *** + def __str__(self): """Briefly describe inputs and outputs of NMFProfiler.""" print_statement = ( - "NMFProfiler\n-----------\n\nAnalysis run on dataset 1 containing " - f"{self.omic1.shape[1]} features measured on {self.omic1.shape[0]}" - f" samples,\ndataset 2 containing {self.omic2.shape[1]} features " - f"measured on the same {self.omic2.shape[0]} samples.\nSamples " - f"are splitted into {len(np.unique(self.y))} distinct groups.\n\n" - f"NMFProfiler (as_sklearn = {self.as_sklearn}) extracted " - f"{len(np.unique(self.y))} profiles in {self.runningtime} " - "seconds.\n\nFor more information, please refer to help(), " - "GitLab page or contact aurelie.mercadie@inrae.fr." + f"NMFProfiler\n-----------\n\nAnalysis run on {len(self.omics)} " + "datasets containing respectively " + f"{*[self.omics[j].shape[1] for j in range(len(self.omics))],} " + f"features measured on the same {self.omics[0].shape[0]} samples." 
+ f"\nSamples are splitted into {len(np.unique(self.y))} distinct" + f" groups.\n\nNMFProfiler (as_sklearn = {self.as_sklearn}) " + f"extracted {len(np.unique(self.y))} profiles " + f"in {self.runningtime} seconds.\n\nFor more information, please " + "refer to help() or GitLab page." ) return print_statement @@ -908,20 +874,17 @@ class NMFProfiler: Apply a min-max normalization and divide by the square root of the number of features. """ - omics = [self.omic1, self.omic2] # *** - # For each dataset - for j in range(len(omics)): # *** - for i in range(omics[j].shape[1]): # *** - omics[j][:, i] = ( # *** - omics[j][:, i] - np.min(omics[j][:, i]) # *** + for j in range(len(self.omics)): + for i in range(self.omics[j].shape[1]): + self.omics[j][:, i] = ( + self.omics[j][:, i] - np.min(self.omics[j][:, i]) ) / ( - np.max(omics[j][:, i]) - np.min(omics[j][:, i]) # *** + np.max(self.omics[j][:, i]) - np.min(self.omics[j][:, i]) ) - omics[j] = omics[j] * (1 / np.sqrt(omics[j].shape[1])) # *** - - self.omic1 = omics[0] # *** - self.omic2 = omics[1] # *** + self.omics[j] = ( + self.omics[j] * (1 / np.sqrt(self.omics[j].shape[1])) + ) def _initialize_w_h_beta(self): """Initialize matrices W, H^j and Beta^j. @@ -933,8 +896,6 @@ class NMFProfiler: ---- Based on _initialize_nmf of sklearn.decomposition.NMF. """ - omicstemp = [self.omic1, self.omic2] # *** - # Extract K, the number of latent components, as equal to # the number of distinct groups in y K = len(np.unique(self.y)) @@ -944,9 +905,9 @@ class NMFProfiler: # For W, H1 and H2: # 1.0. Initialize H0s list - H0s = [0 for j in range(len(omicstemp))] # *** + H0s = [0 for j in range(len(self.omics))] # 1.1. Concatenate omics data sets - omics = np.concatenate(omicstemp, axis=1) # *** + omics = np.concatenate(self.omics, axis=1) # 1.2. Initialize using sklearn function if self.init_method == "random2": # 1.2a. Initialize W with both omics and Hj @@ -957,9 +918,9 @@ class NMFProfiler: init="random", random_state=self.seed ) - for j in range(len(omicstemp)): # *** + for j in range(len(self.omics)): *_, H0s[j] = _initialize_nmf( - X=omicstemp[j], # *** + X=self.omics[j], n_components=K, init="random", random_state=self.seed @@ -969,12 +930,12 @@ class NMFProfiler: # 1.2b. FOR IDENTICAL OMICS DATA SETS - Initialize W # with both omics, H1 with omic1 and H2 as H1 (random) W0, H0s[0] = _initialize_nmf( - X=omicstemp[0], # *** + X=self.omics[0], n_components=K, init="random", random_state=self.seed ) - for j in range(1, len(omicstemp)): # *** + for j in range(1, len(self.omics)): H0s[j] = H0s[0].copy() elif self.init_method == "nndsvd2": # 1.2c. Initialize W with both omics and Hj @@ -985,9 +946,9 @@ class NMFProfiler: init="nndsvd", random_state=self.seed ) - for j in range(len(omicstemp)): # *** + for j in range(len(self.omics)): *_, H0s[j] = _initialize_nmf( - X=omicstemp[j], # *** + X=self.omics[j], n_components=K, init="nndsvd", random_state=self.seed @@ -996,15 +957,15 @@ class NMFProfiler: elif self.init_method == "nndsvd3": # 1.2d. Initialize W with each omic then take the mean and # initialize Hj with specific omic (nndsvd) - W_js = [0 for j in range(len(omics))] # *** - for j in range(len(omicstemp)): # *** + W_js = [0 for j in range(len(self.omics))] + for j in range(len(self.omics)): W_js[j], H0s[j] = _initialize_nmf( - X=omicstemp[j], # *** + X=self.omics[j], n_components=K, init="nndsvd", random_state=self.seed ) - W0 = (1 / len(omicstemp)) * (sum(W_js)) # *** + W0 = (1 / len(self.omics)) * (sum(W_js)) del W_js else: # 1.2e. 
Initialize both with all omics, using whatever method @@ -1016,11 +977,11 @@ class NMFProfiler: random_state=self.seed ) # 1.2e. Split H matrix - if len(omicstemp) == 2: - indices_or_sections = [omicstemp[0].shape[1]] # *** + if len(self.omics) == 2: + indices_or_sections = [self.omics[0].shape[1]] else: indices_or_sections = [ - omicstemp[j].shape[1] for j in range(len(omicstemp)) # *** + self.omics[j].shape[1] for j in range(len(self.omics)) ] H0s = np.split( ary=H0, @@ -1030,19 +991,14 @@ class NMFProfiler: del H0 # For Betas: - Beta0s = [0 for j in range(len(omics))] # *** - for j in range(len(omicstemp)): # *** + Beta0s = [0 for j in range(len(self.omics))] + for j in range(len(self.omics)): Beta0s[j] = np.repeat(1, K) # Put to zero all coefficients linked to component k when k > K_gp if K_gp < K: Beta0s[j][K_gp:] = 0 - H1_0 = H0s[0] # *** - H2_0 = H0s[1] # *** - Beta1_0 = Beta0s[0] # *** - Beta2_0 = Beta0s[1] # *** - - return W0, H1_0, H2_0, Beta1_0, Beta2_0 # *** + return W0, H0s, Beta0s def fit(self): """Run NMFProfiler.""" @@ -1063,19 +1019,12 @@ class NMFProfiler: # Pre-process datasets (with min-max and number of features) self._preprocess_data() - omics = [self.omic1, self.omic2] # *** - # Initialize matrices - W0, H1_0, H2_0, Beta1_0, Beta2_0 = self._initialize_w_h_beta() # *** - - Hs_0 = [H1_0, H2_0] # *** - Betas_0 = [Beta1_0, Beta2_0] # *** + W0, Hs_0, Betas_0 = self._initialize_w_h_beta() self.W_init = W0 - self.H1_init = Hs_0[0] # *** - self.H2_init = Hs_0[1] # *** - self.Beta1_init = Betas_0[0] # *** - self.Beta2_init = Betas_0[1] # *** + self.Hs_init = Hs_0 + self.Betas_init = Betas_0 # Update lambda and gamma given sample size self._update_params() @@ -1086,8 +1035,8 @@ class NMFProfiler: # Create matrices and vectors to update using initialization # (and keep intact initialized matrices and vectors) W = deepcopy(W0) - Hs = [deepcopy(Hs_0[j]) for j in range(len(omics))] # *** - Betas = [deepcopy(Betas_0[j]) for j in range(len(omics))] # *** + Hs = [deepcopy(Hs_0[j]) for j in range(len(self.omics))] + Betas = [deepcopy(Betas_0[j]) for j in range(len(self.omics))] # Create lists of error terms to enrich iteratively (error, regul, nb_iters) = ([] for i in range(3)) @@ -1097,7 +1046,7 @@ class NMFProfiler: distorts, sparsitys, preds, - ) = ([[] for j in range(len(omics))] for _ in range(5)) # *** + ) = ([[] for j in range(len(self.omics))] for _ in range(5)) # Create lists of LDA performance indicators to enrich iteratively ( @@ -1105,18 +1054,15 @@ class NMFProfiler: BICs, AICs, F_pvals, - ) = ([[] for j in range(len(omics))] for _ in range(4)) # *** - bets = [[[] for k in range(K)] for j in range(len(omics))] # *** + ) = ([[] for j in range(len(self.omics))] for _ in range(4)) + bets = [[[] for k in range(K)] for j in range(len(self.omics))] - loss_init = _computeF( # *** - omics[0], - omics[1], + loss_init = _computeF( + self.omics, Y, W0, - Hs_0[0], - Hs_0[1], - Betas_0[0], - Betas_0[1], + Hs_0, + Betas_0, self.params["mu"], self.params["lambda"], self.params["gamma"], @@ -1125,20 +1071,18 @@ class NMFProfiler: ) error.append(loss_init[0]) nb_iters.append(0) - distorts[0].append(loss_init[1]) # *** - distorts[1].append(loss_init[2]) # *** - sparsitys[0].append(loss_init[3]) # *** - sparsitys[1].append(loss_init[4]) # *** - regul.append(loss_init[5]) # *** - preds[0].append(loss_init[6]) # *** - preds[1].append(loss_init[7]) # *** + regul.append(loss_init[3]) + for j in range(len(self.omics)): + distorts[j].append(loss_init[1][j]) + sparsitys[j].append(loss_init[2][j]) 
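# A hedged aside on the H-matrix split above (a sketch, not part of the
# committed code, and not verified against the package): np.split expects
# *cumulative* column indices, so building `indices_or_sections` from the raw
# feature counts would over-split H0 once more than two omics are passed.
# A cumulative variant covering any number of omics, assuming `self.omics`
# as defined in this class:
widths = [omic.shape[1] for omic in self.omics]
indices_or_sections = list(np.cumsum(widths)[:-1])  # e.g. widths [10, 20, 30] -> split points [10, 30]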
+ preds[j].append(loss_init[4][j]) # Keep track of marginal objective functions in Hj with # initial matrices (necessary for linesearch first execution) - for j in range(len(omics)): # *** + for j in range(len(self.omics)): margHs[j].append( _computeMargH( - omics[j], + self.omics[j], Y, W0, Hs_0[j], @@ -1150,13 +1094,14 @@ class NMFProfiler: ) # LDAs with initial matrices - regs_init = [0 for j in range(len(omics))] # *** - for j in range(len(omics)): # *** + regs_init = [0 for j in range(len(self.omics))] + for j in range(len(self.omics)): regs_init[j] = sm.OLS( - self.y, sm.add_constant(safe_sparse_dot(omics[j], Hs_0[j].T)) + self.y, + sm.add_constant(safe_sparse_dot(self.omics[j], Hs_0[j].T)) ).fit() - for j in range(len(omics)): # *** + for j in range(len(self.omics)): for k in range(K): bets[j][k].append(regs_init[j].params[k+1]) R2adjs[j].append(regs_init[j].rsquared_adj) @@ -1170,8 +1115,8 @@ class NMFProfiler: # To keep track of the choice of etas etas_init = [self.params["eta1"], self.params["eta2"]] # *** - etas = [[] for j in range(len(omics))] # *** - for j in range(len(omics)): # *** + etas = [[] for j in range(len(self.omics))] + for j in range(len(self.omics)): etas[j].append(etas_init[j]) # Begin the optimization @@ -1182,35 +1127,26 @@ class NMFProfiler: # Analytical solver... if self.solver == "analytical": - ( # *** + ( W, - Hs[0], - Hs[1], - Betas[0], - Betas[1], + Hs, + Betas, self.params["eta1"], self.params["eta2"], - margHs[0], - margHs[1], - gradHs[0], - gradHs[1], + margHs, + gradHs, ) = _analytic_solver( - omics[0], - omics[1], + self.omics, Y, W, - Hs[0], - Hs[1], - Betas[0], - Betas[1], + Hs, + Betas, self.params, as_sklearn=self.as_sklearn, backtrack=self.backtrack, max_iter_back=self.max_iter_back, - H1_loss=margHs[0], - H2_loss=margHs[1], - H1_grad=gradHs[0], - H2_grad=gradHs[1], + Hlosses=margHs, + Hgrads=gradHs, K=K, K_gp=K_gp, verbose=self.verbose, @@ -1218,7 +1154,7 @@ class NMFProfiler: # ... 
or Autograd solver else: - W, Hs[0], Hs[1], Betas[0], Betas[1] = _autograd_solver() # *** + W, Hs, Betas = _autograd_solver() # Keep track of optimal etas etas_current = [self.params["eta1"], self.params["eta2"]] # *** @@ -1226,15 +1162,12 @@ class NMFProfiler: etas[j].append(etas_current[j]) # *** # Compute the new loss as well as specific terms - loss_ = _computeF( # *** - omics[0], - omics[1], + loss_ = _computeF( + self.omics, Y, W, - Hs[0], - Hs[1], - Betas[0], - Betas[1], + Hs, + Betas, self.params["mu"], self.params["lambda"], self.params["gamma"], @@ -1243,22 +1176,21 @@ class NMFProfiler: ) error.append(loss_[0]) nb_iters.append(n_iter) - distorts[0].append(loss_[1]) # *** - distorts[1].append(loss_[2]) # *** - sparsitys[0].append(loss_[3]) # *** - sparsitys[1].append(loss_[4]) # *** - regul.append(loss_[5]) # *** - preds[0].append(loss_[6]) # *** - preds[1].append(loss_[7]) # *** + regul.append(loss_[3]) + for j in range(len(self.omics)): + distorts[j].append(loss_[1][j]) + sparsitys[j].append(loss_[2][j]) + preds[j].append(loss_[4][j]) # Monitor the LDA part - regs = [0 for j in range(len(omics))] # *** - for j in range(len(omics)): # *** + regs = [0 for j in range(len(self.omics))] + for j in range(len(self.omics)): regs[j] = sm.OLS( - self.y, sm.add_constant(safe_sparse_dot(omics[j], Hs[j].T)) + self.y, + sm.add_constant(safe_sparse_dot(self.omics[j], Hs[j].T)) ).fit() - for j in range(len(omics)): + for j in range(len(self.omics)): for k in range(K): bets[j][k].append(regs[j].params[k+1]) R2adjs[j].append(regs[j].rsquared_adj) @@ -1309,15 +1241,13 @@ class NMFProfiler: # -------------------------------- self.n_iter = n_iter self.W = W - self.H1 = Hs[0] # *** - self.H2 = Hs[1] # *** - self.Beta1 = Betas[0] # *** - self.Beta2 = Betas[1] # *** + self.Hs = Hs + self.Betas = Betas # Keep track of final Hj gradients and build the Hj gradients matrix # (following up their evolution during optimization) # ------------------------------------------------------------------ - for j in range(len(omics)): # *** + for j in range(len(self.omics)): gradHs[j].append( multi_dot([W.T, W, Hs[j]]) + ( @@ -1327,34 +1257,36 @@ class NMFProfiler: np.transpose(Betas[j][newaxis]), Betas[j][newaxis], Hs[j], - omics[j].T, - omics[j], + self.omics[j].T, + self.omics[j], ] ) ) - ( - safe_sparse_dot(W.T, omics[j]) + safe_sparse_dot(W.T, self.omics[j]) + ( self.params["gamma"] * multi_dot( [ np.transpose(Betas[j][newaxis]), self.y[newaxis], - omics[j] + self.omics[j] ] ) ) ) ) - grads = np.array( # *** + grads = np.array( [ - [norm(i) ** 2 for i in gradHs[j]] for j in range(len(omics)) + [ + norm(i) ** 2 for i in gradHs[j] + ] for j in range(len(self.omics)) ] ).T self.df_grads = pd.DataFrame( grads, - columns=[f'grad_H{j+1}' for j in range(len(omics))] # *** + columns=[f'grad_H{j+1}' for j in range(len(self.omics))] ) # Build the error terms matrix @@ -1373,10 +1305,10 @@ class NMFProfiler: error_terms, columns=list(itertools.chain.from_iterable([ ["iter"], - [f"distort{j+1}" for j in range(len(omics))], # *** - [f"sparsity{j+1}" for j in range(len(omics))], # *** + [f"distort{j+1}" for j in range(len(self.omics))], + [f"sparsity{j+1}" for j in range(len(self.omics))], ["regul"], - [f"pred{j+1}" for j in range(len(omics))], # *** + [f"pred{j+1}" for j in range(len(self.omics))], ["loss"], ])), ) @@ -1384,7 +1316,7 @@ class NMFProfiler: # Build the LDA performance matrix # -------------------------------- LDA_perf = np.vstack(nb_iters) - for j in range(len(omics)): + for j in range(len(self.omics)): bets_j 
= np.array(bets[j]).T perf_j = np.array( [ @@ -1407,7 +1339,7 @@ class NMFProfiler: f"BIC (omic{j+1})", f"AIC (omic{j+1})", f"F-statistic p-value (omic{j+1})" - ] for j in range(len(omics))])), # *** + ] for j in range(len(self.omics))])), ])), ) @@ -1416,7 +1348,7 @@ class NMFProfiler: # ------------------------------------------------- self.df_etas = pd.DataFrame( np.array(etas).T, - columns=[f"eta_omic{j+1}" for j in range(len(omics))] # *** + columns=[f"eta_omic{j+1}" for j in range(len(self.omics))] ) return self @@ -1436,8 +1368,6 @@ class NMFProfiler: in each omic. :projs: list of arrays. Projection onto each Hj matrix """ - Hs = [self.H1, self.H2] # *** - # Initialize projection list and compute projs = [] for j in len(new_ind): @@ -1445,7 +1375,7 @@ class NMFProfiler: new_ind_Xj = new_ind[j][newaxis] # Compute projections of the new samples onto dictionary matrices - projs.append(safe_sparse_dot(new_ind_Xj, Hs[j].T)) + projs.append(safe_sparse_dot(new_ind_Xj, self.Hs[j].T)) # For each omic, find which component gave the highest score group = [projs[j].argmax() for j in len(new_ind)] @@ -1475,7 +1405,7 @@ class NMFProfiler: ------ Return a barplot of the different error terms. """ - J = len([self.omic1, self.omic2]) # *** + J = len(self.omics) data_barplot = np.array([ [ *[f"reconstruction omic{j+1}" for j in range(J)], @@ -1546,15 +1476,15 @@ class NMFProfiler: """ plt.figure(figsize=(width, height)) - omics = [self.omic1, self.omic2] # *** - Hs = [self.H1, self.H2] # *** - if obj_to_viz == "W": sns.heatmap(pd.DataFrame(self.W), cmap="viridis") elif obj_to_viz == "omic": - sns.heatmap(pd.DataFrame(omics[(omic_number-1)]), cmap="viridis") + sns.heatmap( + pd.DataFrame(self.omics[(omic_number-1)]), + cmap="viridis" + ) elif obj_to_viz == "H": - sns.heatmap(pd.DataFrame(Hs[(omic_number-1)]), cmap="viridis") + sns.heatmap(pd.DataFrame(self.Hs[(omic_number-1)]), cmap="viridis") else: raise Exception( "Cannot plot this object, please change 'obj_to_viz' input." diff --git a/nmfprofiler/utils/validate_inputs.py b/nmfprofiler/utils/validate_inputs.py index e817e12..51a94ef 100644 --- a/nmfprofiler/utils/validate_inputs.py +++ b/nmfprofiler/utils/validate_inputs.py @@ -22,7 +22,7 @@ def _check_modalities(input, modalities): def _check_inputs_NMFProfiler(X): """Check inputs of NMFProfiler (i.e. omic1, omic2, and so on). """ - params_names = [ + params_names = [ # *** "alpha", "eta1", "eta2", @@ -43,20 +43,19 @@ def _check_inputs_NMFProfiler(X): 'nndsvdar' ] - omics = [X.omic1, X.omic2] - for j in range(len(omics)): - if _check_type(omics[j], np.ndarray) is False: + for j in range(len(X.omics)): + if _check_type(X.omics[j], np.ndarray) is False: raise TypeError( - f"Received {type(omics[j])}, " + f"Received {type(X.omics[j])}, " f"item number {j+1} in 'omics' (i.e. omic{j+1}) " "should be a numpy.ndarray." ) - if omics[0].shape[0] != omics[j].shape[0]: # check dimensions + if X.omics[0].shape[0] != X.omics[j].shape[0]: # check dimensions raise Exception( "All datasets should have the exact same samples, but " - f"omic1 (item 1 in 'omics') has {omics[0].shape[0]} samples " + f"omic1 (item 1 in 'omics') has {X.omics[0].shape[0]} samples " f"and omic{j+1} (item {j+1} in 'omics') has " - f"{omics[j].shape[0]} samples. Please check datasets." + f"{X.omics[j].shape[0]} samples. Please check datasets." ) if _check_type(X.y, np.ndarray) is False: @@ -64,10 +63,10 @@ def _check_inputs_NMFProfiler(X): f"Received {type(X.y)}, " "'y' should be a 1D numpy.ndarray." 
) - if X.y.shape[0] != omics[0].shape[0]: # check dimensions + if X.y.shape[0] != X.omics[0].shape[0]: # check dimensions raise Exception( f"Group vector y has {X.y.shape[0]} samples but " - f"omic1 (item 1 in 'omics') has {omics[0].shape[0]} samples. " + f"omic1 (item 1 in 'omics') has {X.omics[0].shape[0]} samples. " "Please check datasets." ) if len(np.unique(X.y)) < 2: # check y values @@ -85,7 +84,7 @@ def _check_inputs_NMFProfiler(X): "One or more essential key-value pair(s) missing. " "See documentation." ) - if len(X.params) > 8: + if len(X.params) > 8: # *** raise ValueError( "Extra key-value pair(s), 'params' should be of length 8. " "See documentation." @@ -96,11 +95,11 @@ def _check_inputs_NMFProfiler(X): "'alpha' should be a positive integer." ) - etas = [X.params['eta1'], X.params['eta2']] - for j in range(len(etas)): - if _check_type(etas[j], float) is False or etas[j] <= 0: + etas = [X.params['eta1'], X.params['eta2']] # *** + for j in range(len(etas)): # *** + if _check_type(etas[j], float) is False or etas[j] <= 0: # *** raise TypeError( - f"Received {etas[j]} {type(etas[j])}, " + f"Received {etas[j]} {type(etas[j])}, " # *** f"eta {j+1} (item {j+1} in 'params['etas']') " "should be a positive float." ) -- GitLab From ff567aca6974e13bd46ea5b7479aff5a321120cf Mon Sep 17 00:00:00 2001 From: Aurelie Mercadie <aurelie.mercadie@inrae.fr> Date: Wed, 27 Nov 2024 16:46:04 +0100 Subject: [PATCH 13/19] adapted attributes --- tests/test_nmfprofiler.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_nmfprofiler.py b/tests/test_nmfprofiler.py index c096c55..ab5bde3 100644 --- a/tests/test_nmfprofiler.py +++ b/tests/test_nmfprofiler.py @@ -662,7 +662,7 @@ seed = 240709 def test_nmfprofiler_mu(): - model = NMFProfiler(omic1=omic1, omic2=omic2, y=y, seed=seed) + model = NMFProfiler(omics=[omic1, omic2], y=y, seed=seed) res = model.fit() assert ( np.round(res.df_errors.iloc[-1, -1], decimals=4) == 0.9604 @@ -671,8 +671,7 @@ def test_nmfprofiler_mu(): def test_nmfprofiler_prox(): model = NMFProfiler( - omic1=omic1, - omic2=omic2, + omics=[omic1, omic2], y=y, seed=seed, as_sklearn=False, -- GitLab From 1e8bbd46ee6190173e5ee4c45972289803a54c74 Mon Sep 17 00:00:00 2001 From: Aurelie Mercadie <aurelie.mercadie@inrae.fr> Date: Wed, 27 Nov 2024 16:50:05 +0100 Subject: [PATCH 14/19] updated README given attribute changes --- README.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 644ae73..b87d9e2 100644 --- a/README.md +++ b/README.md @@ -29,8 +29,7 @@ seed = 240820 # Run NMFProfiler model = NMFProfiler( - omic1=ToyExample().omic1, - omic2=ToyExample().omic2, + omics=[ToyExample().omic1, ToyExample().omic2], y=ToyExample().y, seed=seed, as_sklearn=False, @@ -42,8 +41,8 @@ print(res) # Visualize analyzed datasets (samples x features) ToyExample().y # 2 groups -res.heatmap(obj_to_viz="omic1", width=15, height=6, path="") -res.heatmap(obj_to_viz="omic2", width=15, height=6, path="") +res.heatmap(obj_to_viz="omic", width=15, height=6, path="", omic_number=1) +res.heatmap(obj_to_viz="omic", width=15, height=6, path="", omic_number=2) ```  @@ -60,8 +59,8 @@ res.heatmap(obj_to_viz="W", width=10, height=10, path="") ```python # Visualize signature matrices H1 and H2 obtained (2 x features) -res.heatmap(obj_to_viz="H1", width=15, height=6, path="") -res.heatmap(obj_to_viz="H2", width=15, height=6, path="") +res.heatmap(obj_to_viz="H", width=15, height=6, path="", omic_number=1) +res.heatmap(obj_to_viz="H", 
width=15, height=6, path="", omic_number=2) ```  -- GitLab From 2c8a631db23a83ae2c7de5fc4de69972f128f193 Mon Sep 17 00:00:00 2001 From: Aurelie Mercadie <aurelie.mercadie@inrae.fr> Date: Wed, 27 Nov 2024 17:44:56 +0100 Subject: [PATCH 15/19] reduced set of params + adapted regarding eta(s) --- nmfprofiler/nmfprofiler.py | 34 ++++++++++++---------------- nmfprofiler/utils/validate_inputs.py | 24 ++++++++------------ 2 files changed, 24 insertions(+), 34 deletions(-) diff --git a/nmfprofiler/nmfprofiler.py b/nmfprofiler/nmfprofiler.py index 0380bdd..f1c092e 100644 --- a/nmfprofiler/nmfprofiler.py +++ b/nmfprofiler/nmfprofiler.py @@ -415,6 +415,7 @@ def _analytic_solver( Hs, Betas, params, + current_etas, as_sklearn, backtrack, max_iter_back, @@ -455,6 +456,7 @@ def _analytic_solver( `alpha`, `sigma`, `m_back` for linesearch(). By default, gamma = 0.005, lambda = 1e-3, mu = 1e-3, eta1 = eta2 = 1, alpha = 2, sigma = 1e-9 and m_back = 1. + :current_etas: DESCRIBE. :as_sklearn: boolean, whether or not a modified version of the MU updates from scikit-learn is used. :backtrack: boolean, if Backtrack LineSearch is performed or not. @@ -493,8 +495,6 @@ def _analytic_solver( See (Fevotte, 2011) and sklearn.decomposition.NMF source code for choice of parameters. """ - etas = [params["eta1"], params["eta2"]] # *** - # W update with new values of Hs W_new = _update_W(W, omics, Hs, params["mu"]) @@ -508,7 +508,7 @@ def _analytic_solver( Betas[j], Y, params["gamma"], - etas[j], + current_etas[j], params["lambda"], as_sklearn=as_sklearn, grad=True, @@ -530,7 +530,7 @@ def _analytic_solver( K_gp ) ) - Hs_new[j], Hlosses[j], etas[j] = _linesearch( + Hs_new[j], Hlosses[j], current_etas[j] = _linesearch( H_old=Hs[j], H=Hs_new[j], W=W_new, @@ -539,7 +539,7 @@ def _analytic_solver( Y=Y, gamma=params["gamma"], lambdA=params["lambda"], - current_eta=etas[j], + current_eta=current_etas[j], H_loss=Hlosses[j], alpha=params["alpha"], sigma=params["sigma"], @@ -569,15 +569,11 @@ def _analytic_solver( if K_gp < K: Betas_new[j][K_gp:] = 0 - params["eta1"] = etas[0] # *** - params["eta2"] = etas[1] # *** - return ( W_new, Hs_new, Betas_new, - params["eta1"], # *** - params["eta2"], # *** + current_etas, Hlosses, Hgrads, ) @@ -806,12 +802,11 @@ class NMFProfiler: self, omics, y, - params={ # *** + params={ "gamma": 1e-2, "lambda": 1e-3, "mu": 1e-3, - "eta1": 1.00, - "eta2": 1.00, + "eta": 1.00, "alpha": 2, "sigma": 1e-9, "m_back": 1, @@ -1114,7 +1109,7 @@ class NMFProfiler: print("Error after initialization step: %f" % (error[0])) # To keep track of the choice of etas - etas_init = [self.params["eta1"], self.params["eta2"]] # *** + etas_init = [self.params["eta"] for j in range(len(self.omics))] etas = [[] for j in range(len(self.omics))] for j in range(len(self.omics)): etas[j].append(etas_init[j]) @@ -1131,8 +1126,7 @@ class NMFProfiler: W, Hs, Betas, - self.params["eta1"], - self.params["eta2"], + current_etas, margHs, gradHs, ) = _analytic_solver( @@ -1142,6 +1136,9 @@ class NMFProfiler: Hs, Betas, self.params, + current_etas=[ + etas[j][-1] for j in range(len(self.omics)) + ], as_sklearn=self.as_sklearn, backtrack=self.backtrack, max_iter_back=self.max_iter_back, @@ -1157,9 +1154,8 @@ class NMFProfiler: W, Hs, Betas = _autograd_solver() # Keep track of optimal etas - etas_current = [self.params["eta1"], self.params["eta2"]] # *** - for j in range(len(etas_current)): # *** - etas[j].append(etas_current[j]) # *** + for j in range(len(self.omics)): + etas[j].append(current_etas[j]) # Compute the new loss as well as 
specific terms loss_ = _computeF( diff --git a/nmfprofiler/utils/validate_inputs.py b/nmfprofiler/utils/validate_inputs.py index 51a94ef..7e221e8 100644 --- a/nmfprofiler/utils/validate_inputs.py +++ b/nmfprofiler/utils/validate_inputs.py @@ -22,10 +22,9 @@ def _check_modalities(input, modalities): def _check_inputs_NMFProfiler(X): """Check inputs of NMFProfiler (i.e. omic1, omic2, and so on). """ - params_names = [ # *** + params_names = [ "alpha", - "eta1", - "eta2", + "eta", "gamma", "lambda", "m_back", @@ -84,9 +83,9 @@ def _check_inputs_NMFProfiler(X): "One or more essential key-value pair(s) missing. " "See documentation." ) - if len(X.params) > 8: # *** + if len(X.params) > 7: raise ValueError( - "Extra key-value pair(s), 'params' should be of length 8. " + "Extra key-value pair(s), 'params' should be of length 7. " "See documentation." ) if _check_type(X.params['alpha'], int) is False or X.params['alpha'] < 1: @@ -94,16 +93,11 @@ def _check_inputs_NMFProfiler(X): f"Received {X.params['alpha']} {type(X.params['alpha'])}, " "'alpha' should be a positive integer." ) - - etas = [X.params['eta1'], X.params['eta2']] # *** - for j in range(len(etas)): # *** - if _check_type(etas[j], float) is False or etas[j] <= 0: # *** - raise TypeError( - f"Received {etas[j]} {type(etas[j])}, " # *** - f"eta {j+1} (item {j+1} in 'params['etas']') " - "should be a positive float." - ) - + if _check_type(X.params['eta'], float) is False or X.params['eta'] <= 0: + raise TypeError( + f"Received {X.params['eta']} {type(X.params['eta'])}, " + "'eta' should be a positive float." + ) if _check_type(X.params['gamma'], float) is False or X.params['gamma'] < 0: raise TypeError( f"Received {X.params['gamma']} {type(X.params['gamma'])}, " -- GitLab From 783b65df0eceb75f815d356b11bfc316de1e0419 Mon Sep 17 00:00:00 2001 From: cebrouard <celinebrouard@MacBook-Pro-de-cebrouard.local> Date: Thu, 28 Nov 2024 14:17:18 +0100 Subject: [PATCH 16/19] changed all documentations for more than 2 omics --- nmfprofiler/nmfprofiler.py | 179 ++++++++++++++----------------------- 1 file changed, 66 insertions(+), 113 deletions(-) diff --git a/nmfprofiler/nmfprofiler.py b/nmfprofiler/nmfprofiler.py index f1c092e..bdf781e 100644 --- a/nmfprofiler/nmfprofiler.py +++ b/nmfprofiler/nmfprofiler.py @@ -47,15 +47,10 @@ def _update_W(W, omics, Hs, mu): ---------- :W: ndarray of shape (n_samples x K), contributions of individuals to latent components. - :omic1: ndarray of shape (n_samples x n_features_omic1), values of - features from (omic1) measured on (n_samples). - :omic2: ndarray of shape (n_samples x n_features_omic2), values of - features from (omic2) measured on the same (n_samples) samples as - (omic1). - :H1: ndarray of shape (K x n_features_omic1), latent components - built on (omic1). - :H2: ndarray of shape (K x n_features_omic2), latent components built on - (omic2). + :omics: list of (n_omics) ndarrays of shape (n_samples x n_features_omicj), values of + features from (omicj) measured on the same (n_samples) samples. + :Hs: list of (n_omics) ndarrays of shape (K x n_features_omicj), latent components + built on (omicj). :mu: float, value for parameter `mu` from objective function. @@ -87,7 +82,7 @@ def _update_H( H_grad=None ): """Update the matrix containing latent components built on omic j. 
- (j referring to the omic dataset number, either 1 or 2) + (j referring to the omic dataset number, between 1 and J) Parameters ---------- @@ -156,7 +151,7 @@ def _update_H( def _update_Beta(omic, h_k, y_k, gamma, Beta): """Update the regression coefficients linked to omic j and component k. - (j referring to the omic dataset number, either 1 or 2) + (j referring to the omic dataset number, between 1 and J) Parameters ---------- @@ -204,7 +199,7 @@ def _computeF( K_gp, details=False ): - """Compute the value of F given omic1, omic2, Y, W, H1, H2, Beta1, Beta2 + """Compute the value of F given Y, W, each omicj, Hj, Betaj and some hyperparameters (i.e. gamma, lambda and mu). Calculate the objective function value, as well as each error term, @@ -212,23 +207,16 @@ def _computeF( Parameters ---------- - :omic1: ndarray of shape (n_samples x n_features_omic1), values of - features from (omic1) measured on (n_samples). - :omic2: ndarray of shape (n_samples x n_features_omic2), values of - features from (omic2) measured on the same (n_samples) samples - as (omic1). + :omics: list of (n_omics) ndarrays of shape (n_samples x n_features_omicj), values of + features from (omicj) measured on the same (n_samples) samples. :Y: ndarray of shape (n_samples x U), one-hot encoding of (y) indicating to which group each sample belongs. :W: ndarray of shape (n_samples x K), contributions of individuals to latent components. - :H1: ndarray of shape (K x n_features_omic1), latent components built on - (omic1). - :H2: ndarray of shape (K x n_features_omic2), latent components built on - (omic2). - :Beta1: ndarray of shape (K x 1), regression coefficients for projection - of individuals from (omic1) onto latent components from H^(1). - :Beta2: ndarray of shape (K x 1), regression coefficients for projection - of individuals from (omic2) onto latent components from H^(2). + :Hs: list of (n_omics) ndarrays of shape (K x n_features_omicj), latent components + built on (omicj). + :Betas: list of (n_omics) ndarrays of shape (K x 1), regression coefficients for projection + of individuals from (omicj) onto latent components from H^(j). :mu: float, value for parameter `mu` from objective function. :lambdA: float, value for parameter `lambda` from objective function. :gamma: float, value for parameter `gamma` from objective function. @@ -329,7 +317,7 @@ def _linesearch( """Find the most suited value for eta^(j). Calculate the optimal gradient descent step size eta^(j) to update H^(j) - (j = 1,2). + (j = 1,...,J). Instead of reuse each time the initial eta_0^(j), use the last one found, eta_(t-1)^(j). @@ -434,37 +422,28 @@ def _analytic_solver( Parameters ---------- - :omic1: ndarray of shape (n_samples x n_features_omic1), values of - features from (omic1) measured on (n_samples). - :omic2: ndarray of shape (n_samples x n_features_omic2), values of - features from (omic2) measured on the same (n_samples) samples as - (omic1). + :omics: list of (n_omics) ndarrays of shape (n_samples x n_features_omicj), values of + features from (omicj) measured on the same (n_samples) samples. :Y: ndarray of shape (n_samples x U), one-hot encoding of (y) indicating to which group each sample belongs. :W: ndarray of shape (n_samples x K), contributions of individuals to latent components, initialized or at (t). - :H1: ndarray of shape (K x n_features_omic1), latent components built on - (omic1), initialized or at (t). - :H2: ndarray of shape (K x n_features_omic2), latent components built on - (omic2), initialized or at (t). 
- :Beta1: ndarray of shape (K x 1), regression coefficients for projection - of individuals from (omic1) onto (H1), initialized or at (t). - :Beta2: ndarray of shape (K x 1), regression coefficients for projection - of individuals from (omic2) onto (H2), initialized or at (t). - :params: dict of length 8 (optional), values for parameters `gamma`, - `lambda`, `mu` from objective function, `eta1`, `eta2` for prox, and + :Hs: list of (n_omics) ndarrays of shape (K x n_features_omicj), latent components + built on (omicj), initialized or at (t). + :Betas: list of (n_omics) ndarrays of shape (K x 1), regression coefficients for projection + of individuals from (omicj) onto H^(j), initialized or at (t). + :params: dict of length 7 (optional), values for parameters `gamma`, + `lambda`, `mu` from objective function, `eta` for prox, and `alpha`, `sigma`, `m_back` for linesearch(). - By default, gamma = 0.005, lambda = 1e-3, mu = 1e-3, eta1 = eta2 = 1, + By default, gamma = 0.005, lambda = 1e-3, mu = 1e-3, eta = 1, alpha = 2, sigma = 1e-9 and m_back = 1. - :current_etas: DESCRIBE. + :current_etas: list of (n_omics) floats, value of parameter `etaj`. :as_sklearn: boolean, whether or not a modified version of the MU updates from scikit-learn is used. :backtrack: boolean, if Backtrack LineSearch is performed or not. :max_iter_back: int, maximum number of iterations for the backtrack. - :H1_loss: list, values of the marginal loss for H^(1). - :H2_loss: list, values of the marginal loss for H^(2). - :H1_grad: list, values of gradient in H^(1) (before update). - :H2_grad: list, values of gradient in H^(2) (before update). + :Hlosses: list of (n_omics) lists, values of the marginal loss for H^(j). + :Hgrads: list of (n_omics) lists, values of gradient in H^(j) (before update). :K: int (optional), number of latent components to build. By default, K = 2. :K_gp: int, number of components dedicated to groups profiling. @@ -474,19 +453,12 @@ Returns ------- :W_new: ndarray of shape (n_samples x K), updated, or (t+1), W matrix. - :H1_new: ndarray of shape (K x n_features_omic1), updated, or (t+1), - H1 matrix. - :H2_new: ndarray of shape (K x n_features_omic2), updated, or (t+1), - H2 matrix. - :Beta1_new: vector of length (K), updated, or (t+1), Beta1 vector. - :Beta2_new: vector of length (K), updated, or (t+1), Beta2 vector. - :params[`eta1`]: float, updated value of parameter `eta1`. - :params[`eta2`]: float, updated value of parameter `eta2`. - :H1_loss: list, augmented list of marginal losses for H^(1). - :H2_loss: list, augmented list of marginal losses for H^(2). - :H1_grad: list, augmented list of gradient values in H^(1) - (before update). - :H2_grad: list, augmented list of gradient values in H^(2) + :Hs_new: list of (n_omics) ndarrays of shape (K x n_features_omicj), updated, or (t+1), + Hj matrices. + :Betas_new: list of (n_omics) vectors of length (K), updated, or (t+1), Betaj vectors. + :current_etas: list of (n_omics) floats, updated value of parameter `etaj`. + :Hlosses: list of (n_omics) lists, augmented list of marginal losses for H^(j). + :Hgrads: list of (n_omics) lists, augmented list of gradient values in H^(j) (before update). Note @@ -597,48 +569,42 @@ class NMFProfiler: The goal of the method is to find relationships between OMICS corresponding to typical profiles of distinct groups of individuals.
The - objective is to find two decompositions, one for each omic, with a common + objective is to find one decomposition for each omic, with a common contribution of individuals, in which latent factor matrices are sparse. The objective function :math:`\mathcal{F} - (\mathbf{W},\mathbf{H}^{(1)},\mathbf{H}^{(2)},\beta^{(1)},\beta^{(2)})` + (\mathbf{W},\mathbf{H}^{(1)},\ldots,\mathbf{H}^{(J)},\beta^{(1)},\ldots,\beta^{(J)})` is as follows: .. math:: - & \dfrac{1}{2}\left( \sum_{j=1}^2\| \mathbf{X}^{(j)} - + & \dfrac{1}{2}\left( \sum_{j=1}^J\| \mathbf{X}^{(j)} - \mathbf{WH}^{(j)} \|_F^2 \right) - &+ \dfrac{\gamma}{2}\left( \sum_{j=1}^2\| \mathbf{Y} - + &+ \dfrac{\gamma}{2}\left( \sum_{j=1}^J\| \mathbf{Y} - \mathbf{X}^{(j)} \mathbf{H}^{(j)\top} \text{Diag}(\beta^{(j)}) \|_F^2 \right) - &+ \sum_{j=1}^{2} \lambda\|\mathbf{H}^{(j)}\|_1 + + &+ \sum_{j=1}^{J} \lambda\|\mathbf{H}^{(j)}\|_1 + \dfrac{\mu}{2}\|\mathbf{W}\|_F^2 Parameters ---------- - :omic1: array-like of shape (n_samples x n_features_omic1). - First omics dataset. (n_samples) is the number of samples and - (n_features_omic1) the number of features. - - :omic2: array-like of shape (n_samples x n_features_omic2). - Second omics dataset. (n_samples) is the number of samples and - (n_features_omic2) is the number of features. - WARNING: (omic2) must contain the exact same samples in the same order - as (omic1). + :omics: list of (n_omics) array-like of shape (n_samples x n_features_omicj). + Omics datasets. (n_samples) is the number of samples and + (n_features_omicj) the number of features of (omicj). + WARNING: each (omicj) must contain the exact same samples in the same order. :y: vector of length (n_samples). - Group to which each sample belongs (same order than the rows of omic1 - and omic2). + Group to which each sample belongs (same order as the rows of omicj). - :params: dict of length 8, optional. + :params: dict of length 7, optional. Contains, in this order, values for hyperparameters `gamma`, `lambda`, - `mu` (from the objective function), for `eta1`, `eta2` (when proximal + `mu` (from the objective function), for `eta` (when proximal optimization is used), and for `alpha`, `sigma`, `m_back` (for `linesearch()`). - By default, gamma = 1e-2, lambda = 1e-3, mu = 1e-3, eta1 = eta2 = 1, + By default, gamma = 1e-2, lambda = 1e-3, mu = 1e-3, eta = 1, alpha = 2, sigma = 1e-9, and m_back = 1. In the objective function, `lambda` and `gamma` are additionally multiplied by (n_samples). @@ -694,44 +660,31 @@ class NMFProfiler: Attributes ---------- - :W: ndarray of shape (n_samples x 2). + :W: ndarray of shape (n_samples x K). Contributions of individuals in each latent component. - :W_init: ndarray of shape (n_samples x 2). + :W_init: ndarray of shape (n_samples x K). Initial version of (W). - :H1: ndarray of shape (2 x n_features_omic1). - Latent components for (omic1). - - :H1_init: ndarray of shape (2 x n_features_omic1). - Initial version of (H1). - - :H2: ndarray of shape (2 x n_features_omic2). - Latent components for (omic2). - - :H2_init: ndarray of shape (2 x n_features_omic2). - Initial version of (H2). - - :Beta1: ndarray of shape (2 x 1). - Regression coefficients for the projection of individuals from (omic1) - onto (H1). + :Hs: list of (n_omics) ndarrays of shape (K x n_features_omicj). + Latent components Hj for (omicj). - :Beta1_init: ndarray of shape (2 x 1). - Initial version of (Beta1). + :Hs_init: list of (n_omics) ndarrays of shape (K x n_features_omicj). + Initial version of (Hj).
- :Beta2: ndarray of shape (K x 1). - Regression coefficients for the projection of individuals from (omic2) - onto (H2). + :Betas: list of (n_omics) ndarrays of shape (K x 1). + Regression coefficients for the projection of individuals from (omicj) + onto (Hj). - :Beta2_init: ndarray of shape (K x 1). - Initial version of (Beta2). + :Betas_init: list of (n_omics) ndarrays of shape (K x 1). + Initial version of (Betaj). :n_iter: int. Final number of iterations (up to convergence or maximum number of iterations is reached). - :df_etas: `pd.dataFrame` of shape (n_iter+1, 2). - Optimal values for parameters `eta1` and `eta2` at each iteration. + :df_etas: `pd.dataFrame` of shape (n_iter+1, n_omics). + Optimal values for parameter `eta` at each iteration. :df_errors: `pd.dataFrame` of shape (n_iter+1, 9). All error terms for each iteration and omic j. :df_ldaperf: `pd.DataFrame` of shape (n_iter+1, 13). All metrics linked to LDA at each iteration and omic j. - :df_grads: `pd.DataFrame` of shape (n_iter+1, 2) - Values of H^(1) and H^(2) gradients before being updated, at each + :df_grads: `pd.DataFrame` of shape (n_iter+1, n_omics) + Values of H^(j) gradients before being updated, at each iteration. :runningtime: float. @@ -791,7 +744,7 @@ class NMFProfiler: >>> y = np.array([1, 1, 1, 0, 0, 0]) >>> seed = 240805 >>> from nmfprofiler import NMFProfiler - >>> model = NMFProfiler(omic1=X1, omic2=X2, y=y, seed=seed) + >>> model = NMFProfiler(omics=[X1, X2], y=y, seed=seed) >>> res = model.fit() >>> print(res) >>> res.heatmap(obj_to_viz="W", height=10, width=10, path="") @@ -884,8 +837,8 @@ class NMFProfiler: def _initialize_w_h_beta(self): """Initialize matrices W, H^j and Beta^j. - Several ways to intialize W, H1, H2. - Beta1 and Beta2 are initialized with 1s vectors. + Several ways to initialize W, Hj. + Betaj are initialized with 1s vectors. Note ---- @@ -898,7 +851,7 @@ class NMFProfiler: # dedicated to groups, identically to K K_gp = len(np.unique(self.y)) - # For W, H1 and H2: + # For W and Hj: # 1.0. Initialize H0s list H0s = [0 for j in range(len(self.omics))] # 1.1. Concatenate omics data sets omics = np.concatenate(self.omics, axis=1) @@ -923,7 +876,7 @@ class NMFProfiler: del _ elif self.init_method == "random3": # 1.2b. FOR IDENTICAL OMICS DATA SETS - Initialize W - # with both omics, H1 with omic1 and H2 as H1 (random) + # with both omics, H1 with omic1 and other Hj as H1 (random) W0, H0s[0] = _initialize_nmf( X=self.omics[0], n_components=K, init="random", random_state=self.seed ) @@ -1326,7 +1327,7 @@ class NMFProfiler: Params ------ :new_ind: list. List of arrays containing values of - features from omic1 and omic2 for a new sample. + features from omicj for a new sample. Values ------ -- GitLab From 2e4af3828d07b6154820940f13a7d0cedeed940d Mon Sep 17 00:00:00 2001 From: Aurelie Mercadie <aurelie.mercadie@inrae.fr> Date: Fri, 29 Nov 2024 10:43:59 +0100 Subject: [PATCH 17/19] linter corrections --- nmfprofiler/nmfprofiler.py | 60 ++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 25 deletions(-) diff --git a/nmfprofiler/nmfprofiler.py b/nmfprofiler/nmfprofiler.py index bdf781e..83a4ef5 100644 --- a/nmfprofiler/nmfprofiler.py +++ b/nmfprofiler/nmfprofiler.py @@ -47,10 +47,11 @@ def _update_W(W, omics, Hs, mu): ---------- :W: ndarray of shape (n_samples x K), contributions of individuals to latent components. - :omics: list of (n_omics) ndarrays of shape (n_samples x n_features_omicj), values of - features from (omicj) measured on the same (n_samples) samples.
- :Hs: list of (n_omics) ndarrays of shape (K x n_features_omicj), latent components - built on (omicj). + :omics: list of (n_omics) ndarrays of shape (n_samples x n_features_omicj), + values of features from (omicj) measured on the same + (n_samples) samples. + :Hs: list of (n_omics) ndarrays of shape (K x n_features_omicj), + latent components built on (omicj). :mu: float, value for parameter `mu` from objective function. @@ -207,16 +208,18 @@ def _computeF( Parameters ---------- - :omics: list of (n_omics) ndarrays of shape (n_samples x n_features_omicj), values of - features from (omicj) measured on the same (n_samples) samples. + :omics: list of (n_omics) ndarrays of shape (n_samples x n_features_omicj), + values of features from (omicj) measured on the same + (n_samples) samples. :Y: ndarray of shape (n_samples x U), one-hot encoding of (y) indicating to which group each sample belongs. :W: ndarray of shape (n_samples x K), contributions of individuals to latent components. - :Hs: list of (n_omics) ndarrays of shape (K x n_features_omicj), latent components - built on (omicj). - :Betas: list of (n_omics) ndarrays of shape (K x 1), regression coefficients for projection - of individuals from (omicj) - onto latent components from H^(j). + :Hs: list of (n_omics) ndarrays of shape (K x n_features_omicj), + latent components built on (omicj). + :Betas: list of (n_omics) ndarrays of shape (K x 1), regression + coefficients for projection of individuals from (omicj) + onto latent components from H^(j). :mu: float, value for parameter `mu` from objective function. :lambdA: float, value for parameter `lambda` from objective function. :gamma: float, value for parameter `gamma` from objective function. @@ -422,16 +425,18 @@ def _analytic_solver( Parameters ---------- - :omics: list of (n_omics) ndarrays of shape (n_samples x n_features_omicj), values of - features from (omicj) measured on the same (n_samples) samples. + :omics: list of (n_omics) ndarrays of shape (n_samples x n_features_omicj), + values of features from (omicj) measured on the same + (n_samples) samples. :Y: ndarray of shape (n_samples x U), one-hot encoding of (y) indicating to which group each sample belongs. :W: ndarray of shape (n_samples x K), contributions of individuals to latent components, initialized or at (t). - :Hs: list of (n_omics) ndarrays of shape (K x n_features_omicj), latent components - built on (omicj), initialized or at (t). - :Betas: list of (n_omics) ndarrays of shape (K x 1), regression coefficients for projection - of individuals from (omicj) onto H^(j), initialized or at (t). + :Hs: list of (n_omics) ndarrays of shape (K x n_features_omicj), + latent components built on (omicj), initialized or at (t). + :Betas: list of (n_omics) ndarrays of shape (K x 1), regression + coefficients for projection of individuals from (omicj) + onto H^(j), initialized or at (t). :params: dict of length 7 (optional), values for parameters `gamma`, `lambda`, `mu` from objective function, `eta` for prox, and `alpha`, `sigma`, `m_back` for linesearch(). By default, gamma = 0.005, lambda = 1e-3, mu = 1e-3, eta = 1, alpha = 2, sigma = 1e-9 and m_back = 1. :current_etas: list of (n_omics) floats, value of parameter `etaj`. :as_sklearn: boolean, whether or not a modified version of the MU updates from scikit-learn is used. :backtrack: boolean, if Backtrack LineSearch is performed or not. :max_iter_back: int, maximum number of iterations for the backtrack. :Hlosses: list of (n_omics) lists, values of the marginal loss for H^(j). - :Hgrads: list of (n_omics) lists, values of gradient in H^(j) (before update). + :Hgrads: list of (n_omics) lists, values of gradient in H^(j) + (before update). :K: int (optional), number of latent components to build.
By default, K = 2. :K_gp: int, number of components dedicated to groups profiling. @@ -453,13 +459,15 @@ Returns ------- :W_new: ndarray of shape (n_samples x K), updated, or (t+1), W matrix. - :Hs_new: list of (n_omics) ndarrays of shape (K x n_features_omicj), updated, or (t+1), - Hj matrices. - :Betas_new: list of (n_omics) vectors of length (K), updated, or (t+1), Betaj vectors. + :Hs_new: list of (n_omics) ndarrays of shape (K x n_features_omicj), + updated, or (t+1), Hj matrices. + :Betas_new: list of (n_omics) vectors of length (K), updated, or (t+1), + Betaj vectors. :current_etas: list of (n_omics) floats, updated value of parameter `etaj`. - :Hlosses: list of (n_omics) lists, augmented list of marginal losses for H^(j). - :Hgrads: list of (n_omics) lists, augmented list of gradient values in H^(j) - (before update). + :Hlosses: list of (n_omics) lists, augmented list of + marginal losses for H^(j). + :Hgrads: list of (n_omics) lists, augmented list of + gradient values in H^(j) (before update). Note @@ -591,10 +599,12 @@ class NMFProfiler: Parameters ---------- - :omics: list of (n_omics) array-like of shape (n_samples x n_features_omicj). + :omics: list of (n_omics) array-like of shape + (n_samples x n_features_omicj). Omics datasets. (n_samples) is the number of samples and (n_features_omicj) the number of features of (omicj). - WARNING: each (omicj) must contain the exact same samples in the same order. + WARNING: each (omicj) must contain the exact same samples + in the same order. :y: vector of length (n_samples). Group to which each sample belongs (same order as the rows of omicj). -- GitLab From 19001147d865fb40f978585128031b7cf3f6cce7 Mon Sep 17 00:00:00 2001 From: Aurelie Mercadie <aurelie.mercadie@inrae.fr> Date: Mon, 9 Dec 2024 15:17:22 +0100 Subject: [PATCH 18/19] added y levels transformation into integers (fixed #9) --- nmfprofiler/nmfprofiler.py | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/nmfprofiler/nmfprofiler.py b/nmfprofiler/nmfprofiler.py index 83a4ef5..ec87ed3 100644 --- a/nmfprofiler/nmfprofiler.py +++ b/nmfprofiler/nmfprofiler.py @@ -706,6 +706,10 @@ class NMFProfiler: Values of H^(j) gradients before being updated, at each iteration. + :df_y: `pd.DataFrame` of shape (n, 2) + Original levels in :y: as well as their associated + integers. + :runningtime: float. Running time of the method measured through `process_time()`.
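The hunk below implements this recoding with an explicit loop over the sorted levels; for intuition, the same mapping can be sketched with plain `numpy` (an illustration only, assuming levels are assigned integers in sorted order, which is what the loop below does):

```python
import numpy as np

y = np.array(["ctrl", "case", "ctrl", "case"])
levels, y_coded = np.unique(y, return_inverse=True)
# levels[k] is the original label recoded as integer k, i.e. the two
# columns later stored side by side in df_y: {'case': 0, 'ctrl': 1}
print(dict(zip(levels, range(len(levels)))), y_coded)
```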
@@ -970,7 +974,22 @@ class NMFProfiler: # dedicated to groups, identically to K K_gp = len(np.unique(self.y)) - # Automatically convert y into a one-hot encoded matrix Y + # Turn y levels into integers + if self.verbose: + print("\nConverting y levels into integers...") + print("Original levels: ", np.unique(self.y)) + y_init = deepcopy(self.y) + num_levels = {} + idx = 0 + for level in np.unique(self.y): + if level not in num_levels: + num_levels[level] = idx + idx += 1 + self.y = np.array([num_levels[level] for level in self.y]) + if self.verbose: + print("New levels: ", np.unique(self.y), "\n") + + # And automatically convert y into a one-hot encoded matrix Y encoder = OneHotEncoder(handle_unknown="ignore") Y = encoder.fit_transform(self.y.reshape(-1, 1)).toarray() @@ -1310,6 +1329,14 @@ class NMFProfiler: columns=[f"eta_omic{j+1}" for j in range(len(self.omics))] ) + # Build the correspondence matrix between + # original levels and their associated integer in y + # ------------------------------------------------- + self.df_y = pd.DataFrame( + {"Original_y": y_init, + "Recoded_y": self.y} + ) + return self def predict(self, new_ind, verbose=False): -- GitLab From 20a91a141c72c2c375b38e66d14b48f384293357 Mon Sep 17 00:00:00 2001 From: Nathalie Vialaneix <nathalie.vialaneix@inrae.fr> Date: Wed, 18 Dec 2024 08:13:06 +0100 Subject: [PATCH 19/19] bumped version number and date --- codemeta.json | 4 ++-- docs/source/conf.py | 2 +- pyproject.toml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/codemeta.json b/codemeta.json index 030de6a..1efac7b 100644 --- a/codemeta.json +++ b/codemeta.json @@ -5,9 +5,9 @@ "codeRepository": "https://forgemia.inra.fr/omics-integration/nmfprofiler", "dateCreated": "2024-06-24", "datePublished": "2024-08-22", - "dateModified": "2024-08-23", + "dateModified": "2024-12-18", "name": "NMFProfiler", - "version": "0.2.1", + "version": "0.3.0", "description": "NMFProfiler: an integrative supervised Non-Negative Matrix Factorization to extract typical profiles of groups of interest combining two different datasets.", "applicationCategory": "Biostatistics", "funding": "2022/0051", diff --git a/docs/source/conf.py b/docs/source/conf.py index cead502..546b33a 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -21,7 +21,7 @@ sys.path.insert(0, os.path.abspath("../..")) project = "NMFProfiler" copyright = "2024, Aurélie Mercadié" author = "Aurélie Mercadié" -release = "0.2.1" +release = "0.3.0" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/pyproject.toml b/pyproject.toml index ae29d4e..7ed7b82 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ build-backend = "setuptools.build_meta" [project] name = "NMFProfiler" -version = "0.2.1" +version = "0.3.0" description = "NMFProfiler: an integrative supervised Non-Negative Matrix Factorization to extract typical profiles of groups of interest combining two different datasets." readme = "README.md" authors = [ -- GitLab
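Taken together, these patches replace the fixed `omic1`/`omic2` pair with a list-valued `omics` argument, so any number of datasets can be analyzed jointly. A minimal end-to-end sketch of the resulting API on three synthetic datasets (shapes and values are invented for illustration; the call signature and attributes follow the README and docstring changes above):

```python
import numpy as np
from nmfprofiler import NMFProfiler

rng = np.random.default_rng(240709)
n = 12
X1 = rng.random((n, 30))             # omic 1: 30 features
X2 = rng.random((n, 50))             # omic 2: 50 features
X3 = rng.random((n, 20))             # omic 3: a third dataset, now supported
y = np.array(["A"] * 6 + ["B"] * 6)  # string labels, recoded internally (see df_y)

model = NMFProfiler(omics=[X1, X2, X3], y=y, seed=240709)
res = model.fit()

print(res.df_y.drop_duplicates())    # original label -> integer mapping
print(len(res.Hs), len(res.Betas))   # one Hj and one Betaj per omic
res.heatmap(obj_to_viz="H", width=15, height=6, path="", omic_number=3)
```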