Merge pull request #39 from VasudhaJha/feature/export-all-files

Feature/export all files
xinglab-ai · Jul 24, 2023 · 7ee63fe · 7ee63fe
2 parents 28f75c3 + b984ca3
commit 7ee63fe
Show file tree

Hide file tree

Showing 10 changed files with 557 additions and 29 deletions.
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -14,5 +14,5 @@ jobs:
     steps:
       - uses: rymndhng/release-on-push-action@master
         with:
-          bump_version_scheme: patch # can be either "major", "minor", "patch" or "norelease"
+          bump_version_scheme: minor # can be either "major", "minor", "patch" or "norelease"
           tag_prefix: v
diff --git a/README.md b/README.md
@@ -171,12 +171,10 @@ plt.show()
 ```python
 import scanpy as sc
 import matplotlib.pyplot as plt
-import genomap.genoMOI as gp
 import scipy.io as sio
 import numpy as np
 import pandas as pd
-import umap
-
+from genomap.genoMOI import genoMOIvis, genoMOItraj
 
 # Load five different pancreatic datasets
 dx = sio.loadmat('dataBaronX.mat')
@@ -196,23 +194,79 @@ y = np.squeeze(dx['classLabel'])
 dx = sio.loadmat('batchLabel.mat')
 ybatch = np.squeeze(dx['batchLabel'])
 
-# Apply genoMOI with genomap size of 44x44 and dimension of 32 for the returned integrated data
-resVis=gp.genoMOI(data, data2, data3, data4, data5, colNum=44, rowNum=44, n_dim=32)
+# Apply genomap-based multi omic integration and visualize the integrated data with local structure for cluster analysis
+# returns 2D visualization, cluster labels, and integrated data
+resVis,cli,int_data=genoMOIvis(data, data2, data3, data4, data5, colNum=12, rowNum=12, n_dim=32, epoch=10, prealign_method='scanorama')
 
-# Visualize the integrated data using UMAP
-embedding = umap.UMAP(n_neighbors=30,min_dist=0.3,n_epochs=200).fit_transform(resVis)
 
 plt.figure(figsize=(15, 10))
 plt.rcParams.update({'font.size': 28})    
-h1=plt.scatter(embedding[:, 0], embedding[:, 1], c=y,cmap='jet', marker='o', s=18)      #  ax = plt.subplot(3, n, i + 1*10+1)
-plt.xlabel('UMAP1')
-plt.ylabel('UMAP2')
+h1=plt.scatter(resVis[:, 0], resVis[:, 1], c=y,cmap='jet', marker='o', s=18)      #  ax = plt.subplot(3, n, i + 1*10+1)
+plt.xlabel('genoVis1')
+plt.ylabel('genoVis2')
+plt.tight_layout()
+plt.colorbar(h1)
+plt.show()
+
+plt.figure(figsize=(15, 10))
+plt.rcParams.update({'font.size': 28})    
+h1=plt.scatter(resVis[:, 0], resVis[:, 1], c=ybatch,cmap='jet', marker='o', s=18)      #  ax = plt.subplot(3, n, i + 1*10+1)
+plt.xlabel('genoVis1')
+plt.ylabel('genoVis2')
 plt.tight_layout()
 plt.colorbar(h1)
 plt.show()
 ```
 
-### Example 6 - Try genoSig for finding gene signatures for cell/data classes
+```python
+# Apply genomap-based multi omic integration and visualize the integrated data with global structure for trajectory analysis
+
+# returns 2D embedding, cluster labels, and integrated data
+resTraj,cli,int_data=genoMOItraj(data, data2, data3, data4, data5, colNum=12, rowNum=12, n_dim=32, epoch=10, prealign_method='scanorama')
+
+
+plt.figure(figsize=(15, 10))
+plt.rcParams.update({'font.size': 28})    
+h1=plt.scatter(resTraj[:, 0], resTraj[:, 1], c=y,cmap='jet', marker='o', s=18)      #  ax = plt.subplot(3, n, i + 1*10+1)
+plt.xlabel('genoTraj1')
+plt.ylabel('genoTraj2')
+plt.tight_layout()
+plt.colorbar(h1)
+plt.show()
+
+plt.figure(figsize=(15, 10))
+plt.rcParams.update({'font.size': 28})    
+h1=plt.scatter(resTraj[:, 0], resTraj[:, 1], c=ybatch,cmap='jet', marker='o', s=18)      #  ax = plt.subplot(3, n, i + 1*10+1)
+plt.xlabel('genoTraj1')
+plt.ylabel('genoTraj2')
+plt.tight_layout()
+plt.colorbar(h1)
+plt.show()
+```
+
+### Example 6 - Try genoAnnotate for cell annotation
+
+```python
+import scanpy as sc
+import pandas as pd
+import genomap.genoAnnotate as gp
+#Load the PBMC dataset
+adata = sc.read_10x_mtx("pbmc3k_filtered_gene_bc_matrices/")
+
+# Input: adata: annData containing the raw gene counts
+# tissue type: e.g. Immune system,Pancreas,Liver,Eye,Kidney,Brain,Lung,Adrenal,Heart,Intestine,Muscle,Placenta,Spleen,Stomach,Thymus 
+
+adataP = gp.genoAnnotate(adata,tissue_type="Immune system")
+
+
+# Compute UMAP (requires neighborhood graph, see the previous code for Louvain clustering)
+sc.tl.umap(adataP)
+# Create a UMAP plot colored by cell type labels
+cell_annotations=adataP.obs['cell_type']
+sc.pl.umap(adataP, color='cell_type')
+```
+
+### Example 7 - Try genoSig for finding gene signatures for cell/data classes
 
 ```python
 import numpy as np
@@ -244,7 +298,7 @@ result=gp.genoSig(genoMaps,T,label,userPD,gene_namesRe, epochs=50)
 print(result.head())
 ```
 
-### Example 7 - Try genoClassification for tabular data classification
+### Example 8 - Try genoClassification for tabular data classification
 
 ```python
 import pandas as pd
@@ -281,8 +335,7 @@ est=gp.genoClassification(training_data, training_labels, test_data, rowNum=rowN
 print('Classification accuracy of genomap approach:'+str(np.sum(est==groundTruthTest) / est.shape[0]))  
 ```
 
-
-### Example 8 - Try genoRegression for tabular data regression
+### Example 9 - Try genoRegression for tabular data regression
 
 ```python
 import pandas as pd

diff --git a/data/readme.txt b/data/readme.txt
@@ -1,2 +1 @@
-Please download the data from https://drive.google.com/drive/u/3/folders/1QNJdPdXf1lfq0Mu5p5JrzMDhwJJCwgO7
- and put it in this folder (data/)
+Please download the data from https://drive.google.com/drive/folders/1xq3bBgVP0NCMD7bGTXit0qRkL8fbutZ6
diff --git a/genomap/genoAnnotate/__init__.py b/genomap/genoAnnotate/__init__.py
@@ -0,0 +1 @@
+from .genoAnnotate import *
diff --git a/genomap/genoAnnotate/genoAnnotate.py b/genomap/genoAnnotate/genoAnnotate.py
@@ -0,0 +1,92 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Jul 23 15:18:43 2023
+
+@author: Md Tauhidul Islam
+# This code is inspired by scType (https://github.com/IanevskiAleksandr/sc-type)
+# We are in the process of using image matching technique for further enhancement
+# of the cell annotation
+"""
+
+from genomap.genotype import *
+import scanpy as sc
+
+def genoAnnotate(adata,tissue_type,database=None):
+    # Input: adata: annData containing the raw gene counts
+    # tissue type: e.g. Immune system,Pancreas,Liver,Eye,Kidney,Brain,Lung,Adrenal,Heart,Intestine,Muscle,Placenta,Spleen,Stomach,Thymus 
+    # database: User can select his/her own database in excel format
+
+    # Database file
+    if database==None:
+        database = "https://raw.githubusercontent.com/xinglab-ai/self-consistent-expression-recovery-machine/master/demo/data/genoANN_db.xlsx";        
+
+    # Filter cells
+    sc.pp.filter_cells(adata, min_genes=200)
+    # Normalize data
+    adata.raw = adata
+    sc.pp.normalize_total(adata, target_sum=10000)
+    sc.pp.log1p(adata)
+    sc.pp.highly_variable_genes(adata, n_top_genes=2000)
+    adata = adata[:, adata.var['highly_variable']]
+    # Scale data and run PCA
+    sc.pp.scale(adata, max_value=10)
+    sc.tl.pca(adata)
+
+    # Prepare positive and negative gene sets
+    result = gene_sets_prepare(database, tissue_type)
+    gs = result['gs_positive']
+    gs2 = result['gs_negative']
+    cell_types = result['cell_types']
+
+
+    data=adata.raw.X.toarray()
+    # Build a cell-by-gene DataFrame from adata.raw (rows: cell names, columns: gene names)
+    scRNAseqData = pd.DataFrame(data, index=adata.raw.obs_names, columns=adata.raw.var_names)
+
+    # Compute cell-type score for each cell
+    es_max = sctype_score(scRNAseqData=scRNAseqData, scaled=True, gs=gs, gs2=gs2, cell_types=cell_types)
+    es_max.columns = cell_types
+    es_max.index = scRNAseqData.index
+
+    # Calculate neighborhood graph of cells using the PCA representation
+    sc.pp.neighbors(adata, n_neighbors=10, use_rep='X_pca')
+    # Perform clustering so that cell-type can be assigned to each cluster
+    sc.tl.leiden(adata)
+    # The cluster labels are stored in `adata.obs['leiden']`
+    results = []
+    for cl in adata.obs['leiden'].unique():
+        cells_in_cluster = adata.obs_names[adata.obs['leiden'] == cl]
+        es_max_cl = es_max.loc[cells_in_cluster].sum().sort_values(ascending=False)
+        results.append(pd.DataFrame({
+            'cluster': cl,
+            'type': es_max_cl.index[:1],
+            'scores': es_max_cl.values[:1],
+            'ncells': len(cells_in_cluster)
+            }))
+
+    results = pd.concat(results)
+    results.loc[results['scores'] < results['ncells'] / 4, 'type'] = 'Unknown'
+    results.set_index('cluster', inplace=True)
+    # Assign the cell type labels to the cells in the AnnData object
+    adata.obs['cell_type'] = results.loc[adata.obs['leiden'], 'type'].values
+    return adata
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+