Source code for scSemiProfiler.utils

import pdb,sys,os
import timeit
import warnings
warnings.filterwarnings('ignore')
import argparse
import numpy as np
from sklearn.neural_network import MLPClassifier
import anndata
import scanpy as sc
import scipy
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sn
from scipy import sparse
from typing import Union
from mpmath import *
mp.dps = 200
import scipy.stats as stats
import gseapy
import copy
from typing import Tuple
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from .inference import *
import random
import faiss
from sklearn.decomposition import PCA



def get_cohort_sc(name:str, path:str, cohort_name:str, representative_ids:list) -> None:
    """
    Get representatives' single-cell data from a given preprocessed cohort (COVID-19, colorectal cancer, iMGL) and store the data as cohort_name.h5ad under the project's directory
    
    Parameters
    ----------
    name 
        Project name
    path 
        Path to the cohort dataset h5ad file
    cohort_name 
        The output single-cell data file name
    representative_ids 
        The sample IDs of the selected representatives
        
    Returns
    -------
    None

    Example
    -------
    >>> covidpath = '../preprocessed_data/preprocessed_covid-19_sc.h5ad'
    >>> name = 'semi-profile-covid-19'
    >>> cohort_name = 'covid-19'
    >>> representative_ids['MH9143273', 'BGCV09_CV0279', 'MH9143276', 'MH8919282']
    >>> get_cohort_sc(name=name, path=path, cohort_name= cohort_name, representative_ids=representative_ids)
    """
    
    scdata = anndata.read_h5ad(path)
    sids = []
    f = open(name + '/sids.txt', 'r')
    lines = f.readlines()
    for l in lines:
        sids.append(l.strip())
    f.close()
    
    rsids = representative_ids
    rmask=[]    
    for i in range(len(scdata.obs.index)):
        sid = scdata.obs['sample_ids'][i]
        if sid in rsids:
            rmask.append(True)
        else:
            rmask.append(False)
    rmask = np.array(rmask)
    
    repredata = scdata[rmask,:]
    
    X = repredata.X
    if X.max() < 20:
        if scipy.sparse.issparse(X):
            X = np.array(X.todense())
        X = np.exp(X) - 1
        X = sparse.csr_matrix(X)
    adata = anndata.AnnData(X)
    adata.obs = repredata.obs
    adata.var = repredata.var
    
    adata.write(name + '/' + cohort_name + '.h5ad')
    
    print('Obtained single-cell data for representatives from the cohort.')
    
    return






[docs]def estimate_cost(total_samples:int,n_representatives:int) -> Tuple[float,float]: """ Estimate the cost of semi-profiling and real-profiling. Parameters ---------- total_samples Total number of samples n_representatives: Number of representatives Returns ------- semicost Cost of semi-profiling realcost Cost of real-profiling Example ------- >>> semicost, realcost = estimate_cost(12,2) """ bulkcost = 7000 + total_samples * 110 sccost = 5000 * 0.3 * n_representatives semicost = bulkcost + sccost realcost = 5000 * 0.3 * total_samples pct = round((realcost - semicost)/realcost * 100,1) print() print('Estimated semi-profiling cost: $'+str(semicost)) print('Estimated cost if conducting real single-cell profiling: $'+str(realcost)) print('Percentage saved: ' + str(pct) + '%') return semicost,realcost
# visualizing reconstruction performance of a representative
[docs]def visualize_recon(name:str, representative:Union[int,str], save = None) -> None: """ Visualize the performance of reconstruction by plotting the original and reconstructed data in the same UMAP. Parameters ---------- name Project name representative Representative sample ID (string or int) save Path within the 'figures' folder to save the plot Returns ------- None Example ------- >>> name = 'project name' >>> visualize_recon(name, 6) """ sids = [] f = open(name+'/sids.txt','r') lines = f.readlines() for l in lines: sids.append(l.strip()) f.close() if type(representative) == type(1): sid = sids[representative] else: sid = representative repredata = anndata.read_h5ad(name + '/sample_sc/'+sid+'.h5ad') x0 = repredata.X genelen = x0.shape[1] device = 'cpu' k=15 batch_size = x0.shape[0] diagw=1 adata,adj,variances,bulk,geneset_len = setdata(name,sid,device,k,diagw) model = fastgenerator(variances,None,geneset_len,adata,n_hidden=256,n_latent=32,dropout_rate=0) path = name + '/models/fastreconst2_' + sid model.module.load_state_dict(torch.load(path)) x1 = [] scdl = model._make_data_loader( adata=adata,batch_size=batch_size ) for tensors in scdl: samples = model.module.sample(tensors, n_samples=1) x1.append(samples) # save inferred data x1 = np.array(torch.cat(x1))[:,:genelen] vdata = anndata.AnnData(np.concatenate([x0,x1],axis=0)) rc = [] for i in range(x0.shape[0]): rc.append('Representative') for i in range(x1.shape[0]): rc.append('Reconstructed') vdata.obs['reconstruction'] = rc sc.pp.log1p(vdata) sc.tl.pca(vdata) sc.pp.neighbors(vdata) sc.tl.umap(vdata) palette = {'Representative':'blue','Reconstructed':'gray'} sc.pl.umap(vdata,color='reconstruction',alpha=0.5,palette=palette,save=save)
# visualizing inference performance for a target sample
[docs]def visualize_inferred(name:str, target:int, representatives:list, cluster_labels:list, realdata_path:str = 'example_data/scdata.h5ad', n_comps:int=100, save = None) -> None: """ Visualize the inference performance by plotting the representative, inferred target, and target ground truth in the same UMAP. Parameters ---------- name Project name target: Target sanmple ID (number) representatives: Representatives sample IDs (int) cluster_labels: Cluster labels realdata_path: Path to the real-profiled single-cell data n_comps: The number of PCs in PCA save Path within the 'figures' folder to save the plot Returns ------- None Example ------- >>> name = 'project name' >>> >>> # load representatives and cluster labels lists >>> repres = [] >>> f=open(name + '/status/init_representatives.txt','r') >>> lines = f.readlines() >>> f.close() >>> for l in lines: >>> repres.append(int(l.strip())) >>> >>> cl = [] >>> f=open(name + '/status/init_cluster_labels.txt','r') >>> lines = f.readlines() >>> f.close() >>> for l in lines: >>> cl.append(int(l.strip())) >>> >>> visualize_inferred(name, 0, repres, cl) """ sids = [] f = open(name+'/sids.txt','r') lines = f.readlines() for l in lines: sids.append(l.strip()) f.close() representative = representatives[cluster_labels[target]] repredata = anndata.read_h5ad(name + '/sample_sc/'+sids[representative]+'.h5ad') x0 = repredata.X genelen = x0.shape[1] xsem = np.load(name + '/inferreddata/' + sids[representative] + '_to_' + sids[target]+'.npy' ) x1 = xsem[:,:genelen] alldata = anndata.read_h5ad(realdata_path) tgtdata = alldata[alldata.obs['sample_ids']==sids[target]] x2 = tgtdata.X if scipy.sparse.issparse(x2): x2 = x2.todense() x2 = np.array(x2) if x2.max()<20: x2 = np.exp(x2)-1 vdata = anndata.AnnData(np.concatenate([x0,x1,x2],axis=0)) rc = [] for i in range(x0.shape[0]): rc.append('Representative') for i in range(x1.shape[0]): rc.append('Target inferred') for i in range(x2.shape[0]): rc.append('Target ground truth') vdata.obs['inference'] = rc sc.pp.log1p(vdata) sc.tl.pca(vdata,n_comps=n_comps) sc.pp.neighbors(vdata) sc.tl.umap(vdata) palette = {'Representative':'blue','Target inferred':'yellow','Target ground truth':'red'} sc.pl.umap(vdata,color='inference',alpha=0.5,palette=palette,save=save) return
[docs]def loss_curve(name:str, reprepid:int=None,tgtpid:int=None,stage:int=1, save = None)->None: """ Visualize the training loss curves Parameters ---------- name Project name reprepid: Representative sanmple ID tgtpid: target sample IDs stage: The training stage to visualize, 1: pretrain1; 2: pretrain2; 3: inference save Path within the 'figures' folder to save the plot Returns ------- None Example ------- >>> name = 'project name' >>> loss_curve(name, reprepid='BGCV09_CV0279',tgtpid=None,stage=1) # or loss_curve(name, sids, reprepid=6,tgtpid=None,stage=1) """ sids = [] f = open(name+'/sids.txt','r') lines = f.readlines() for l in lines: sids.append(l.strip()) f.close() if type(reprepid) == type(1): reprepid = sids[reprepid] if type(tgtpid) != type(None): if type(tgtpid) == type(1): tgtpid = sids[tgtpid] if (stage == 1) or (stage == 'pretrain1') or (stage == 'pretrain 1'): fname = name+'/history/pretrain1_' + reprepid + '.pkl' with open(fname, 'rb') as pickle_file: history = pickle.load(pickle_file) plt.plot(history['train_loss_epoch']) plt.title('Pretrain 1 Loss Curve') elif (stage == 2) or (stage == 'pretrain2') or (stage == 'pretrain 2'): fname = name+'/history/pretrain2_' + reprepid + '.pkl' with open(fname, 'rb') as pickle_file: history = pickle.load(pickle_file) plt.plot(history['train_loss_epoch']) plt.title('Pretrain 2 Loss Curve') elif (stage == 3) or (stage == 'inference'): fname = name+'/history/inference_' + reprepid + '_to_' + tgtpid + '.pkl' with open(fname, 'rb') as pickle_file: history = pickle.load(pickle_file) fig, axs = plt.subplots(5,1,figsize=(2,8)) axs[0].plot(history['bulk0']) axs[1].plot(history['bulk1']) axs[2].plot(history['bulk2']) axs[3].plot(history['bulk3']) axs[4].plot(history['bulk4']) plt.title('Inference Loss') if save!= None: plt.savefig(name + 'figures/'+save,dpi=600,bbox_inches='tight') return
[docs]def assemble_cohort(name:str, representatives:Union[list,str], cluster_labels:Union[list,str], celltype_key:str = 'celltypes', sample_info_keys:list = ['states_collection_sum'], bulkpath:str = 'example_data/bulkdata.h5ad', annotator = MLPClassifier(hidden_layer_sizes=(200,))): """ Assemble inferred sample data and representative sample data into semi-profiled cohort and annotate the celltype. Parameters ---------- name: Project name representatives: Either a list of representatives or path to a txt file specifying the representative information cluster_labels: Either a list of sample cluster labels or path to a txt file specifying the sample cluster label information celltype_key: The key in .obs specifying the cell type information sample_info_keys: Keys for other sample-level information to be stored in the assembled dataset bulkpath: Path to the bulk data Returns ------- semidata: The assembled and annotated semi-profiled dataset Example ------- >>> semisdata = assemble_cohort(name, >>> repre, >>> cls, >>> celltype_key = 'celltypes', >>> sample_info_keys = ['states_collection_sum'], >>> bulkpath = 'example_data/bulkdata.h5ad') """ print('Start assembling semi-profiled cohort.') # read sample ids sids = [] f = open(name + '/sids.txt', 'r') lines = f.readlines() for l in lines: sids.append(l.strip()) f.close() # read representatives and clustering info if type(representatives) == type(''): rs = [] f = open(representatives, 'r') lines = f.readlines() for l in lines: rs.append(int(l.strip())) f.close() representatives = rs if type(cluster_labels) == type(''): cl = [] f = open(cluster_labels, 'r') lines = f.readlines() for l in lines: cl.append(int(l.strip())) f.close() cluster_labels = cl ### read expression data # read representative single-cell h5ad xtrain = [] ytrain = [] for i in range(len(representatives)): sid = sids[representatives[i]] adata = anndata.read_h5ad(name + '/sample_sc/' + sid + '.h5ad') xtrain.append(np.array(adata.X)) sample_celltype = list(adata.obs[celltype_key]) ytrain = ytrain + sample_celltype xtrain = np.concatenate(xtrain, axis=0) if xtrain.max()>10: xtrain = np.log1p(xtrain) ytrain = np.array(ytrain) # train annotator print('Training cell type annotator.') st = timeit.default_timer() #annotator = MLPClassifier(hidden_layer_sizes=(200,)) annotator.fit((xtrain),ytrain) ed = timeit.default_timer() print('Finished. Cost ' + str(ed-st) + ' seconds.') # assemble cohort print('Generating semi-profiled cohort data.') X = [] # expression matrix source = [] # inferred or representative semicelltype = [] # cell type semiids = [] # sample ids sampleinfo = {} #sample information to be preserved for k in sample_info_keys: # other sample information to be preserved sampleinfo[k] = [] # read expression data for i in range(len(sids)): if i in representatives: sid = sids[i] adata = anndata.read_h5ad(name + '/sample_sc/' + sid + '.h5ad') # expression X.append(np.array(adata.X)) # essential keys semicelltype = semicelltype + list(adata.obs[celltype_key]) #semiids = semiids + list(adata.obs['sample_ids']) for j in range(adata.X.shape[0]): semiids.append(sids[i]) source.append('Representative') ## other keys for k in sample_info_keys: sampleinfo[k] = sampleinfo[k] + list(adata.obs[k]) else: # inferred target sample # expression x = np.load(name + '/inferreddata/' + \ sids[representatives[cluster_labels[i]]] + \ '_to_' + sids[i] + '.npy') X.append(np.array(x)) # essential keys predicted_celltype = annotator.predict(np.log1p(x)) semicelltype = semicelltype + list(predicted_celltype) for j in range(x.shape[0]): semiids.append(sids[i]) source.append('Inferred') ## other keys bulkdata = anndata.read_h5ad(bulkpath) for k in sample_info_keys: for j in range(x.shape[0]): sampleinfo[k].append(bulkdata.obs[k][i]) semidata = anndata.AnnData(np.array(np.concatenate(X))) semidata.obs['source'] = source semidata.obs[celltype_key] = semicelltype semidata.obs['sample_ids'] = semiids semidata.var = adata.var for k in sample_info_keys: semidata.obs[k] = sampleinfo[k] semidata.write(name + '/semidata.h5ad') print('Finished assembling semi-profiled cohort. Output as semidata.h5ad') return semidata
[docs]def assemble_representatives(name:str,celltype_key:str='celltypes',sample_info_keys:list = ['states_collection_sum'],rnd:int=2,batch:int=2,gtsc:str = 'example_data/scdata.h5ad', gtbulk:str = 'example_data/bulkdata.h5ad') -> Tuple[anndata.AnnData, anndata.AnnData]: """ Assemble previous round of inferred representative data and annotate the cell type. The real-profiled representatives in the current round is also provided for comparison. Parameters ---------- name: Project name celltype_key: The key in .obs specifying the cell type information sample_info_keys: Keys for other sample-level information to be stored in the assembled dataset rnd: The round of semi-profiling to assemble. For example, select the second round (2 batches of representatives) using rnd = 2 batch: The representative selection batch size gtsc: Path to ground truth data gtbulk: Path to bulk data Returns ------- realrepdata: The real-profiled representative dataset infrepdata: The inferred representative dataset Example ------- >>> real_rep, inferred_rep = assemble_representatives(name,celltype_key='celltypes',sample_info_keys = ['states_collection_sum'],rnd=2,batch=2) """ sids=[] f = open(name+'/sids.txt','r') lines=f.readlines() for l in lines: sids.append(l.strip()) f.close() reps = [] f=open(name + '/status/eer_representatives_' + str(rnd) + '.txt','r') lines = f.readlines() for l in lines: reps.append(int(l.strip())) f.close() newcl=[] f=open(name + '/status/eer_cluster_labels_' + str(rnd) + '.txt','r') lines = f.readlines() for l in lines: newcl.append(int(l.strip())) f.close() newreps = reps[-batch:] alldata = anndata.read_h5ad(gtsc) genelen = len(alldata.var.index) newrepsids = [] for i in newreps: newrepsids.append(sids[i]) repmask =[] for i in range(alldata.X.shape[0]): if alldata.obs['sample_ids'][i] in newrepsids: repmask.append(True) else: repmask.append(False) repmask = np.array(repmask) realrepdata = alldata[repmask] ### load semi profiled data oldreps = reps[:-batch] if rnd==2: clname = name + '/status/init_cluster_labels.txt' else: clname = name + '/status/eer_cluster_labels_'+str(rnd-1)+'.txt' oldcl = [] f = open(clname, 'r') lines = f.readlines() for l in lines: oldcl.append(int(l.strip())) f.close() ### train annotator using the previous representatives xtrain = [] ytrain = [] for i in range(len(oldreps)): sid = sids[oldreps[i]] adata = anndata.read_h5ad(name + '/sample_sc/' + sid + '.h5ad') x = np.array(adata.X) if x.max() > 20: x = np.log1p(x) xtrain.append(x) sample_celltype = list(adata.obs[celltype_key]) ytrain = ytrain + sample_celltype xtrain = np.concatenate(xtrain, axis=0) ytrain = np.array(ytrain) # train annotator print('Training cell type annotator.') st = timeit.default_timer() annotator = MLPClassifier() #hidden_layer_sizes=(200,)) annotator.fit((xtrain),ytrain) ed = timeit.default_timer() print('Finished. Cost ' + str(ed-st) + ' seconds.') semicelltype = [] # cell type semiids = [] # sample ids sampleinfo = {} #sample information to be preserved for k in sample_info_keys: # other sample information to be preserved sampleinfo[k] = [] xinf = [] for i in range(batch): # expression sid = sids[newreps[i]] repsid = sids[oldreps[oldcl[newreps[i]]]] x = np.load(name + '/inferreddata/' + repsid + '_to_' + sid + '.npy') xinf.append(x) # essential keys if x.max()>20: x = np.log1p(x) predicted_celltype = annotator.predict(x) semicelltype = semicelltype + list(predicted_celltype) for j in range(x.shape[0]): semiids.append(sids[i]) ## other keys bulkdata = anndata.read_h5ad(gtbulk) for k in sample_info_keys: for j in range(x.shape[0]): sampleinfo[k].append(bulkdata.obs[k][i]) xinf = np.concatenate(xinf, axis=0) xinf = xinf[:,:genelen] infrepdata = anndata.AnnData(np.array(xinf)) infrepdata.obs[celltype_key] = semicelltype infrepdata.obs['sample_ids'] = semiids infrepdata.var = bulkdata.var for k in sample_info_keys: infrepdata.obs[k] = sampleinfo[k] infrepdata.write(name + '/infrepdata_' + str(rnd) + '.h5ad') realrepdata.write(name + '/realrepdata_' + str(rnd) + '.h5ad') return realrepdata, infrepdata
[docs]def compare_umaps( semidata:anndata.AnnData, name:str = 'testexample', representatives:str = 'testexample/status/init_representatives.txt', cluster_labels:str = 'testexample/status/init_cluster_labels.txt', celltype_key:str = 'celltypes', save = None ) -> Tuple[anndata.AnnData, anndata.AnnData, anndata.AnnData]: """ Compare the real-profiled and semi-profiled datasets by plotting them in a same UMAP Parameters ---------- semidata: Semi-profiled dataset name: Project name representatives: Path to the txt file storing the representative information cluster_labels: Path to the txt file storing the cluster label information celltype_key: The key in .obs specifying the cell type information save Path within the 'figures' folder to save the plot Returns ------- combdata Combined dataset, with real-profiled cells in the front gtdata Real-profiled dataset semidata Semi-profiled dataset Example ------- >>> combined_data,gtdata,semidata = compare_umaps( >>> semidata = semisdata, # assembled semi-profiled dataset >>> name = name, >>> representatives = name + '/status/init_representatives.txt', >>> cluster_labels = name + '/status/init_cluster_labels.txt', >>> celltype_key = 'celltypes' >>> ) """ # visualize UMAPs of real-profiled and semi-profiled data for comparison # return both datasets with PCA and UMAP coordinates added gtdata = anndata.read_h5ad('example_data/scdata.h5ad') x0 = np.array(gtdata.X.todense()) x1 = np.array(semidata.X) x1 = np.log1p(x1) combdata = anndata.AnnData(np.concatenate([x0,x1],axis=0)) combdata.var = gtdata.var combdata.obs[celltype_key] = list(gtdata.obs[celltype_key]) + list(semidata.obs[celltype_key]) cohort = [] for i in range(gtdata.X.shape[0]): cohort.append('real-profiled') for i in range(semidata.X.shape[0]): cohort.append('semi-profiled') combdata.obs['cohort'] = cohort sc.pp.log1p(combdata) sc.tl.pca(combdata,n_comps=100) sc.pp.neighbors(combdata,n_neighbors=50) sc.tl.umap(combdata) if save!=None: savecomb = save + 'real_vs_semi' savereal = save + 'real' savesemi = save + 'semi' else: savecomb = None savereal = None savesemi = None sc.pl.umap(combdata,color = 'cohort', title = 'Real-profiled VS Semi-profiled', save = savecomb) semidata.obsm['X_umap'] = combdata.obsm['X_umap'][gtdata.X.shape[0]:] gtdata.obsm['X_umap'] = combdata.obsm['X_umap'][:gtdata.X.shape[0]] sc.pl.umap(gtdata,color = 'celltypes',title = 'Real-profiled', save = savereal) sc.pl.umap(semidata,color = 'celltypes',title = 'Semi-profiled', save = savesemi) return combdata,gtdata,semidata
[docs]def compare_adata_umaps( semidata:anndata.AnnData, gtdata:anndata.AnnData, name:str = 'testexample', celltype_key:str = 'celltypes', save = None ) -> Tuple[anndata.AnnData, anndata.AnnData, anndata.AnnData]: """ Compare the real-profiled and semi-profiled datasets by plotting them in a same UMAP Parameters ---------- semidata: Semi-profiled dataset gtdata: Real-profiled dataset name: Project name celltype_key: The key in .obs specifying the cell type information save Path within the 'figures' folder to save the plot Returns ------- combdata Combined dataset, with real-profiled cells in the front gtdata Real-profiled dataset semidata Semi-profiled dataset Example ------- >>> combdata, gtdata, semidata = compare_adata_umaps( >>> inferred_rep, # inferred representatives from the last round >>> real_rep, # real-profiled representatives from the current round >>> name = name, # project name >>> celltype_key = 'celltypes' >>> ) """ x0 = gtdata.X if type(x0) != np.ndarray:#== scipy.sparse.csr_matrix: x0 = x0.todense() x1 = semidata.X if type(x1) != np.ndarray:#scipy.sparse.csr_matrix: x1 = x1.todense() x0 = np.array(x0) x1 = np.array(x1) if x1.max()>20: x1 = np.log1p(x1) if x0.max()>20: x0 = np.log1p(x0) combdata = anndata.AnnData(np.concatenate([x0,x1],axis=0)) combdata.var = gtdata.var combdata.obs[celltype_key] = list(gtdata.obs[celltype_key]) + list(semidata.obs[celltype_key]) cohort = [] for i in range(gtdata.X.shape[0]): cohort.append('real-profiled') for i in range(semidata.X.shape[0]): cohort.append('semi-profiled') combdata.obs['cohort'] = cohort #sc.pp.log1p(combdata) sc.tl.pca(combdata,n_comps=100) sc.pp.neighbors(combdata,n_neighbors=50) sc.tl.umap(combdata) sc.pl.umap(combdata,color = 'cohort', title = 'Real-profiled VS Semi-profiled') semidata.obsm['X_umap'] = combdata.obsm['X_umap'][gtdata.X.shape[0]:] gtdata.obsm['X_umap'] = combdata.obsm['X_umap'][:gtdata.X.shape[0]] if save!=None: savereal = save+'read' savesemi = save+'semi' else: savereal = None savesemi = None sc.pl.umap(gtdata,color = celltype_key,title = 'Real-profiled',save=savereal) sc.pl.umap(semidata,color = celltype_key,title = 'Semi-profiled',save=savesemi) return combdata,gtdata,semidata
[docs]def celltype_proportion(adata:anndata.AnnData,totaltypes:Union[np.array,list]) -> np.array: """ Compute the cell type proportion in a dataset Parameters ---------- adata: The dataset to investigate totaltypes: The total cell types to consider Returns ------- prop Cell type proportion Example ------- >>> real_prop = celltype_proportion(real_rep,totaltypes) """ prop = np.zeros(len(totaltypes)) for i in range(len(totaltypes)): prop[i] += (adata.obs['celltypes'] == totaltypes[i]).sum() # norm prop = prop/prop.sum() return prop
[docs]def composition_by_group( adata:anndata.AnnData, colormap:Union[str,list] = None, groupby:str = None, title:str = 'Cell type composition' , save = None ) -> None: """ Visualizing the cell type composition in each group. Parameters ---------- adata: The dataset to investigate colormap: The colormap for visualization groupby: The key in .obs specifying groups. title: Plot title save Path within the 'figures' folder to save the plot Returns ------- None Example ------- >>> groupby = 'states_collection_sum' >>> composition_by_group( >>> adata = gtdata, >>> groupby = groupby, >>> title = 'Ground truth' >>> ) """ totaltypes = np.array(adata.obs['celltypes'].cat.categories) if colormap == None: colormap = adata.uns['celltypes_colors'] conditions = np.unique(adata.obs[groupby]) n = conditions.shape[0] percentages = [] for i in range(conditions.shape[0]): condition_prop = celltype_proportion(adata[adata.obs[groupby] == conditions[i]],totaltypes) percentages.append(condition_prop) fig, axs = plt.subplots(n,1,figsize=(n,1)) axs[0].set_title(title) for j in range(n): for i in range(len(totaltypes)): axs[j].barh(conditions[j],percentages[j][i], left = sum(percentages[j][:i]),color = colormap[i]) axs[j].set_xlim([0, 1]) axs[j].set_yticklabels([]) axs[j].yaxis.set_tick_params(left = False, right = False , labelleft = False , labelbottom = False, bottom = False) if j != n: axs[j].set_xticklabels([]) axs[j].text(-0.01, 0, conditions[j], ha='right', va='center') patches = [] for i in range(len(totaltypes)): patches.append(mpatches.Patch(color=colormap[i], label=totaltypes[i])) axs[-1].legend(handles=patches, loc='center left', bbox_to_anchor=(1.1, n)) plt.xlabel('Proportion') if save != None: plt.savefig(name + 'figures/'+save,dpi=600,bbox_inches='tight') return
[docs]def geneset_pattern( adata: anndata.AnnData, genes: list, condition_key: str, celltype_key: str, totaltypes: Union[np.array,list] = None, baseline:str = None, save = None ) -> np.array: """ Generate heatmaps for visualizing gene set activation pattern in a dataset. Parameters ---------- adata: The dataset to investigate genes: The list of genes in the gene set condition_key: The key in .obs specifying different sample conditions. celltype_key: The key in .obs specifying cell type information totaltypes: Total cell type names baseline: Baseline condition save Path within the 'figures' folder to save the plot Returns ------- pattern np.array Example ------- >>> gtmtx = geneset_pattern(gtdata,IFN_genes,'states_collection_sum','celltypes') """ sc.tl.score_genes(adata, genes, ctrl_size=50, gene_pool=None, n_bins=25, score_name='geneset', random_state=0, copy=False, use_raw=None) conditions = np.unique(adata.obs[condition_key]) if type(totaltypes) == type(None): totaltypes = np.unique(adata.obs[celltype_key]) scores = adata.obs['geneset'] pattern = np.zeros((len(conditions),len(totaltypes))) for i in range(len(conditions)): condition = conditions[i] condition_mask = (adata.obs[condition_key] == condition) for j in range(len(totaltypes)): celltype = totaltypes[j] celltype_mask = (adata.obs[celltype_key] == celltype) relevantscores = scores[np.logical_and(condition_mask, celltype_mask)] pattern[i,j] = relevantscores.mean() sn.heatmap(pattern,xticklabels = totaltypes,yticklabels=conditions,cmap="coolwarm",square = True) if (save != None) and (save!=False): plt.savefig(save) return pattern
# dot plot
[docs]def celltype_signature_comparison(gtdata:anndata.AnnData,semisdata:anndata.AnnData,celltype_key:str, top_n_degs:int = 3, save = None) -> Tuple[float, float, list]: """ Use dotplot to compare the cell type signatures found using the real-profiled dataset and the semi-profiled datset. Parameters ---------- gtdata: The real-profiled dataset semisdata: The semi-profiled dataset celltype_key: The key in .obs specifying the cell type labels save Path within the 'figures' folder to save the plot top_n_degs Top n DEGs used for each cell type in the dotplot Returns ------- Tuple[float, float, List]: A tuple containing two floats and a list: Pearson correlation of dot sizes, dot colors, and a list of signature genes. Example ------- >>> celltype_signature_comparison(gtdata=gtdata,semisdata=semisdata,celltype_key='celltypes') """ totaltypes = np.unique(gtdata.obs[celltype_key]) sc.tl.rank_genes_groups(gtdata, celltype_key, method='t-test') signatures = [] for j in range(totaltypes.shape[0]): typede = [] for i in range(top_n_degs): g = gtdata.uns['rank_genes_groups']['names'][i][j] typede.append(g) signatures = signatures + typede if save!=None: savegt = save + 'gt' savesemi = save + 'semi' else: savegt = None savesemi = None sc.pl.dotplot(gtdata, signatures, groupby = celltype_key, save = savegt) sc.pl.dotplot(semisdata, signatures, groupby = celltype_key,save=savesemi) dpgt = sc.pl.dotplot(gtdata, signatures, groupby=celltype_key,return_fig=True) dpgtcolor = (dpgt.dot_color_df.to_numpy()) dpgtsize = (dpgt.dot_size_df.to_numpy()) dpsemi = sc.pl.dotplot(semisdata, signatures, groupby=celltype_key,return_fig=True) dpsemicolor = (dpsemi.dot_color_df.to_numpy()) dpsemisize = (dpsemi.dot_size_df.to_numpy()) print('Expression fraction (size) similarity between real and semi-profiled:') print(scipy.stats.pearsonr(dpgtsize.flatten(), dpsemisize.flatten())) print('Expression intensity (color) similarity between real and semi-profiled') print(scipy.stats.pearsonr(dpgtcolor.flatten(), dpsemicolor.flatten())) return scipy.stats.pearsonr(dpgtsize.flatten(), dpsemisize.flatten())[0], scipy.stats.pearsonr(dpgtcolor.flatten(), dpsemicolor.flatten())[0], signatures
### statistics utils and rrho
[docs]def comb(a:int,b:int) -> mp.mpf: """ Combination number Parameters ---------- a: The total number of choice b: The number of elements to choose. Returns ------- cad Choose a from b Example ------- >>> print(comb(5,3)) """ a=mp.mpf(a) b=mp.mpf(b) cab = fac(a)/fac(a-b)/fac(b) return cab
[docs]def hyperp(N:int,n1:int,n2:int,k:int) -> mp.mpf: """ Returns the pmf of a hypergeometric test. Parameters ---------- N: Population size. In our case this is the total number of gene n1: The number of element in the first set n2: The number of element in the second set k: The number of overlap Returns ------- p cdf Example ------- >>> print(hyperp(6000,100,100,97)) """ #cdf N = mp.mpf(N) n1 = mp.mpf(n1) n2 = mp.mpf(n2) k = mp.mpf(k) p = comb(n2,k)*comb(N-n2,n1-k)/comb(N,n1) return p
[docs]def hypert(N:int,n1:int,n2:int,k:int) -> mp.mpf: """ Returns the p-value of a hypergeometric test. Parameters ---------- N: Population size. In our case this is the total number of gene n1: The number of element in the first set n2: The number of element in the second set k: The number of overlap Returns ------- pval p-value Example ------- >>> print(hypert(6000,100,100,97)) """ cdf = mp.mpf(0) for i in range(0,int(k)+1): cdf += hyperp(N,n1,n2,i) # print() return (1-cdf)
def rrho_plot(name:str, list1:list, list2:list, list3:list, list4:list, celltype:str, population:int ,upperbound:int=50, save = None) -> Tuple[np.array,np.array,np.array,np.array]: """ Generates data for RRHO graph used to compare the positive and negative markers found using real-profiled and semi-profiled datasets. Parameters ---------- name project name list1: Positive markers found using the real-profiled dataset list2: Positive markers found using the semi-profiled dataset list3: Negative markers found using the real-profiled dataset list4: Negative markers fuond using the semi-profiled dataset celltype: The selected cell type to analyze population: The population size in hypergeometric test for evaluating the overlap between two gene lists. In our case this will be to total number of genes used. upperbound: The upperbound for negative log p-value for visualization save Path within the 'figures' folder to save the plot Returns ------- rrho_matrix1 Values for the first quadrant rrho_matrix2 Values for the second quadrant rrho_matrix3 Values for the third quadrant rrho_matrix4 Values for the forth quadrant """ n1 = len(list1) # gt pos n2 = len(list2) # semi pos n3 = len(list3) # gt neg n4 = len(list4) # semi neg # Calculate the maximum rank to consider max_rank1 = max(n3, n2) max_rank2 = max(n3, n4) max_rank3 = max(n1, n2) max_rank4 = max(n1, n4) # Initialize the RRHO plot matrix rrho_matrix1 = np.zeros((max_rank1, max_rank1)) rrho_matrix2 = np.zeros((max_rank2, max_rank2)) rrho_matrix3 = np.zeros((max_rank3, max_rank3)) rrho_matrix4 = np.zeros((max_rank4, max_rank4)) # Iterate over different cutoff ranks upper=1e-50 ## ax1 for rank1 in range(1, max_rank1 + 1): for rank2 in range(1, max_rank1 + 1): # Get the top genes up to the cutoff ranks top_genes1 = set(list1[:rank1]) top_genes2 = set(list4[:rank2]) union = np.unique(list(top_genes1) + list(top_genes2)) # Calculate the overlap between the two gene sets overlap = len(top_genes1.intersection(top_genes2)) # Calculate the hypergeometric p-value for the overlap p_value = hypert(population,rank1,rank2,overlap)#1 - scipy.stats.hypergeom.cdf(overlap, 6000, rank1, rank2) #print(p_value) p_value = float(p_value) if p_value < upper: p_value = upper if np.isnan(p_value): print(overlap, len(union), rank1, rank2) # Store the negative logarithm of the p-value in the RRHO matrix rrho_matrix1[rank1 - 1, rank2 - 1] = -np.log10(p_value) #ax2 for rank1 in range(1, max_rank2 + 1): for rank2 in range(1, max_rank2 + 1): # Get the top genes up to the cutoff ranks top_genes1 = set(list1[:rank1]) top_genes2 = set(list2[:rank2]) union = np.unique(list(top_genes1) + list(top_genes2)) # Calculate the overlap between the two gene sets overlap = len(top_genes1.intersection(top_genes2)) # Calculate the hypergeometric p-value for the overlap p_value = hypert(population,rank1,rank2,overlap)#1 - scipy.stats.hypergeom.cdf(overlap, 6000, rank1, rank2) #print(p_value) p_value = float(p_value) if p_value < upper: p_value = upper if np.isnan(p_value): print(overlap, len(union), rank1, rank2) # Store the negative logarithm of the p-value in the RRHO matrix rrho_matrix2[rank1 - 1, rank2 - 1] = -np.log10(p_value) for rank1 in range(1, max_rank3 + 1): for rank2 in range(1, max_rank3 + 1): # Get the top genes up to the cutoff ranks top_genes1 = set(list3[:rank1]) top_genes2 = set(list4[:rank2]) union = np.unique(list(top_genes1) + list(top_genes2)) # Calculate the overlap between the two gene sets overlap = len(top_genes1.intersection(top_genes2)) # Calculate the hypergeometric p-value for the overlap p_value = hypert(population,rank1,rank2,overlap)#1 - scipy.stats.hypergeom.cdf(overlap, 6000, rank1, rank2) #print(p_value) p_value = float(p_value) if p_value < upper: p_value = upper if np.isnan(p_value): print(overlap, len(union), rank1, rank2) # Store the negative logarithm of the p-value in the RRHO matrix rrho_matrix3[rank1 - 1, rank2 - 1] = -np.log10(p_value) for rank1 in range(1, max_rank4 + 1): for rank2 in range(1, max_rank4 + 1): # Get the top genes up to the cutoff ranks top_genes1 = set(list3[:rank1]) top_genes2 = set(list2[:rank2]) union = np.unique(list(top_genes1) + list(top_genes2)) # Calculate the overlap between the two gene sets overlap = len(top_genes1.intersection(top_genes2)) # Calculate the hypergeometric p-value for the overlap p_value = hypert(population,rank1,rank2,overlap)#1 - scipy.stats.hypergeom.cdf(overlap, 6000, rank1, rank2) #print(p_value) p_value = float(p_value) if p_value < upper: p_value = upper if np.isnan(p_value): print(overlap, len(union), rank1, rank2) # Store the negative logarithm of the p-value in the RRHO matrix rrho_matrix4[rank1 - 1, rank2 - 1] = -np.log10(p_value) fig, axs = plt.subplots(2, 2,figsize=(6,6)) cmap = 'magma' rrho_matrix2 = np.flip(rrho_matrix2,axis=1) rrho_matrix3 = np.flip(rrho_matrix3,axis=0) rrho_matrix4 = np.flip(np.flip(rrho_matrix4,axis=0),axis=1) vmx = 50 # Plot heatmap in the first quadrant im1 = axs[0, 0].imshow(rrho_matrix1, cmap=cmap,vmax=vmx, aspect='auto') axs[0, 0].set_xticks([]) axs[0, 0].set_yticks([]) # Plot heatmap in the second quadrant im2 = axs[0, 1].imshow(rrho_matrix2, cmap=cmap,vmax=vmx, aspect='auto') axs[0, 1].set_xticks([]) axs[0, 1].set_yticks([]) # Plot heatmap in the third quadrant im3 = axs[1, 0].imshow(rrho_matrix3, cmap=cmap,vmax=vmx, aspect='auto') axs[1, 0].set_xticks([]) axs[1, 0].set_yticks([]) # Plot heatmap in the fourth quadrant im4 = axs[1, 1].imshow(rrho_matrix4, cmap=cmap,vmax=vmx, aspect='auto') axs[1, 1].set_xticks([]) axs[1, 1].set_yticks([]) cbar_ax = fig.add_axes([1, 0.025, 0.04, 0.95]) cbar = fig.colorbar(im4, ax=axs[1, 1], fraction=0.046 * 2, pad=0.04,cax=cbar_ax,label='-log10(p)') plt.tight_layout() if save!= None: plt.savefig(name + '/figures/'+save,dpi=600,bbox_inches='tight') plt.show() return rrho_matrix1,rrho_matrix2,rrho_matrix3,rrho_matrix4
[docs]def rrho(name,gtdata:anndata.AnnData,semisdata:anndata.AnnData,celltype_key:str,celltype:str, save = None) -> Tuple[np.array,np.array,np.array,np.array]: """ Use RRHO graph to compare the positive and negative markers found using real-profiled and semi-profiled datasets. Parameters ---------- name Project name gtdata: Real-profiled (ground truth) data semisdata: Semi-profiled dataset celltype_key: The key in anndata.AnnData.obs for storing the cell type information celltype: The selected cell type to analyze save Path within the 'figures' folder to save the plot Returns ------- rrho_matrix1 Values for the first quadrant rrho_matrix2 Values for the second quadrant rrho_matrix3 Values for the third quadrant rrho_matrix4 Values for the forth quadrant Example ------- >>> _ = rrho(gtdata=gtdata,semisdata=semisdata,celltype_key='celltypes',celltype='CD4') """ print('Plotting RRHO for comparing ' + str(celltype) + ' markers.') sc.tl.rank_genes_groups(gtdata, celltype_key, method='t-test',rankby_abs=True) sc.tl.rank_genes_groups(semisdata, celltype_key, method='t-test',rankby_abs=True) population = semisdata.X.shape[1] nummarkers = 500 gt_posmarkers = [] gt_negmarkers = [] semi_posmarkers = [] semi_negmarkers = [] totaltypes = (gtdata.uns['rank_genes_groups']['names'].dtype).names totaltypes = list(totaltypes) j = totaltypes.index(celltype) # gt for i in range(nummarkers): g = gtdata.uns['rank_genes_groups']['names'][i][j] score = gtdata.uns['rank_genes_groups']['scores'][i][j] if score > 0: gt_posmarkers.append(g) else: gt_negmarkers.append(g) #semii for i in range(nummarkers): g = semisdata.uns['rank_genes_groups']['names'][i][j] score = semisdata.uns['rank_genes_groups']['scores'][i][j] if score > 0: semi_posmarkers.append(g) else: semi_negmarkers.append(g) ngenes = 50 gt_posmarkers = gt_posmarkers[:ngenes] semi_posmarkers = semi_posmarkers[:ngenes] gt_negmarkers = gt_negmarkers[:ngenes] semi_negmarkers = semi_negmarkers[:ngenes] rmat = rrho_plot(name = name,\ list1 = gt_posmarkers, \ list2 = semi_posmarkers,\ list3 = gt_negmarkers,\ list4 = semi_negmarkers,\ celltype = celltype, population=population, upperbound = 50,save=save) return rmat
[docs]def enrichment_comparison(name:str, gtdata:anndata.AnnData, semisdata:anndata.AnnData, celltype_key:str, selectedtype:str, save = None)-> Tuple[np.array,np.array,np.array,np.array]: """ Compare the enrichment analysis results using the real-profiled and semi-profiled datasets. Parameters ---------- name: Project name gtdata: Real-profiled (ground truth) data semisdata: Semi-profiled dataset celltype_key: The key in anndata.AnnData.obs for storing the cell type information selectedtype: The selected cell type to analyze save Path within the 'figures' folder to save the plot Returns ------- CommonDEGs The number of overlapping DEGs between real and semi-profiled data HypergeometricP P-value of hypergeometric test examining the overlap between two versions of DEGs PearsonR Pearson correlation between bar lengths in real-profiled and semi-profiled bar plots PearsonP P-value of the Pearson correlation test Example ------- >>> _ = enrichment_comparison(name, gtdata, semisdata, celltype_key = 'celltypes', selectedtype = 'CD4') """ totaltypes = np.unique(gtdata.obs[celltype_key]) sc.tl.rank_genes_groups(gtdata, celltype_key, method='t-test') typededic = {} for j in range(totaltypes.shape[0]): celltype = totaltypes[j] typede = [] for i in range(100): g = gtdata.uns['rank_genes_groups']['names'][i][j] typede.append(g) typededic[celltype] = typede sc.tl.rank_genes_groups(semisdata, celltype_key, method='t-test') semitypededic = {} for j in range(totaltypes.shape[0]): celltype = totaltypes[j] typede = [] for i in range(100): g = semisdata.uns['rank_genes_groups']['names'][i][j] typede.append(g) semitypededic[celltype] = typede c=0 gtdeg = typededic[selectedtype] semideg = semitypededic[selectedtype] for i in semideg: if i in gtdeg: c+=1 hyperpval = hypert(semisdata.X.shape[1],100,100,c) print('p-value of hypergeometric test for overlapping DEGs:', str(float(hyperpval))) if (os.path.isdir(name + '/gseapygt')) == False: os.system('mkdir ' + name + '/gseapygt') results = gseapy.enrichr(gene_list=gtdeg, gene_sets='GO_Biological_Process_2021',outdir=name + '/gseapygt') f=open(name + '/gseapygt/GO_Biological_Process_2021.human.enrichr.reports.txt','r') lines=f.readlines() f.close() gtsets=[] gtps=[] gtdic={} for l in lines[1:]: term = l.split('\t')[1] p = float(l.split('\t')[4]) gtsets.append(term) gtps.append(p) gtdic[term] = p results = gseapy.enrichr(gene_list=semideg, gene_sets='GO_Biological_Process_2021',outdir=name + '/gseapysemi') f=open(name + '/gseapysemi/GO_Biological_Process_2021.human.enrichr.reports.txt','r') lines=f.readlines() f.close() semisets=[] semips=[] semidic={} for l in lines[1:]: term = l.split('\t')[1] p = float(l.split('\t')[4]) semisets.append(term) semips.append(p) semidic[term]=p terms = copy.deepcopy(gtsets[:10]) real_data = copy.deepcopy(gtps[:10]) sim_data = [] for i in range(10): gtterm = semisets[i] if gtterm not in semidic.keys(): sim_data.append(1) else: sim_data.append(semidic[gtterm]) for i in range(10): if semisets[i] in terms: continue terms.append(semisets[i]) sim_data.append(semips[i]) if semisets[i] not in gtdic.keys(): real_data.append(1) else: real_data.append(gtdic[semisets[i]]) real_data = np.flip(real_data) sim_data = np.flip(sim_data) terms = np.flip(terms) sim_bar_lengths = [-np.log10(p) for p in sim_data] real_bar_lengths = [-np.log10(p) for p in real_data] res = scipy.stats.pearsonr(np.array(sim_bar_lengths),np.array(real_bar_lengths)) print('Significance correlation:',res) fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(8, 5)) bar_width = 0.4 y = np.arange(len(sim_data))+1 ax1.barh(y, real_bar_lengths, height=bar_width, color='green', label='Real') ax1.set_xlabel('-log10(p)') ax1.set_ylabel('Term') ax1.set_title('Real Data ('+str(len(semideg))+' DEGs)') ax2.barh(y, sim_bar_lengths, height=bar_width, color='blue', label='Simulated') ax2.set_xlabel('-log10(p)') ax2.set_title('Semi-profiled Data('+str(len(gtdeg))+' DEGs)') max_val = max(max(sim_bar_lengths), max(real_bar_lengths)) ax1.set_xlim(0,max_val + 1) ax2.set_xlim(0, max_val + 1) ax1.invert_xaxis() ax1.set_yticks(y) ax2.set_yticklabels(terms) fig.suptitle(selectedtype + ' GO ('+str(c)+' Overlap DEGs)') if save!= None: plt.savefig(name + '/figures/'+save+selectedtype + ' GO.pdf',bbox_inches='tight') plt.savefig(name + '/figures/'+save+selectedtype + ' GO.jpg',dpi=600,bbox_inches='tight') plt.show() return c, float(hyperpval), res[0], res[1]
[docs]def faiss_knn(query:np.array, x:np.array, n_neighbors:int=1) -> np.array: """ Compute distances from a vector to its K-nearest neighbros in a matrix. Parameters ---------- query: The query vector X: The data matrix n_neighbors: How many neighbors to consider? Returns ------- weights: distances Example ------- >>> faiss_knn(ma,mb,n_neighbors=1) """ n_samples = x.shape[0] n_features = x.shape[1] x = np.ascontiguousarray(x) index = faiss.IndexFlatL2(n_features) #index = faiss.IndexFlatIP(n_features) index.add(x) if n_neighbors < 2: neighbors = 2 else: neighbors = n_neighbors weights, targets = index.search(query, neighbors) #sources = np.repeat(np.arange(n_samples), neighbors) #targets = targets.flatten() #weights = weights.flatten() weights = weights[:,:n_neighbors] if -1 in targets: raise InternalError("Not enough neighbors were found. Please consider " "reducing the number of neighbors.") return weights
[docs]def get_error(name:str,scpath:str = 'example_data/scdata.h5ad')->Tuple[list,list,list,list]: """ Conclude the semi-profiling history of a project and output the erros, upperbounds, and lower bounds, which are necessary for overall performance evaluation. Parameters ---------- name: Project name scpath: Path to single-cell data Returns ------- upperbounds The error upper bounds calculated in each round lowerbounds The error lower bounds calculated in each round semierrors The errors of semi-profiling naiveerrors The errors of the selection-only method Example ------- >>> upperbounds, lowerbounds, semierrors, naiveerrors = get_error(name) """ # load gound truth print('loading and processing ground truth data.') st = timeit.default_timer() gtdata = anndata.read_h5ad(scpath) sids = [] f = open(name + '/sids.txt','r') lines = f.readlines() for l in lines: sids.append(l.strip()) f.close() gts = [] for i in range(len(sids)): sid = sids[i] adata = gtdata[gtdata.obs['sample_ids'] == sid] x = adata.X if scipy.sparse.issparse(x): x = x.todense() x = np.array(x) if x.max()>20: x = np.log1p(x) gts.append(np.array(x)) ed = timeit.default_timer() print('finished processing ground truth',str(ed-st),' seconds') # compute semi-profile error for each round upperbounds = [] lowerbounds = [] naiveerrors = [] semierrors = [] print('computing error for each round') for rnd in range(1,len(os.listdir(name+'/status'))//2+1): print('round ',str(rnd)) # read representative and cluster labels info if rnd == 1: reprefile = name + '/status/init_representatives.txt' clusterfile = name + '/status/init_cluster_labels.txt' else: reprefile = name + '/status/eer_representatives_' + str(rnd) + '.txt' clusterfile = name + '/status/eer_cluster_labels_' + str(rnd) + '.txt' representatives = [] f = open(reprefile,'r') lines = f.readlines() for l in lines: representatives.append(int(l.strip())) f.close() cluster_labels = [] f = open(clusterfile,'r') lines = f.readlines() for l in lines: cluster_labels.append(int(l.strip())) f.close() print('loading semi-profiled cohort') st = timeit.default_timer() # semi-profiled cohort semis = [] for i in range(len(sids)): if i in representatives: semis.append(gts[i]) else: sid = sids[i] represid = sids[representatives[cluster_labels[i]]] xinf = np.load(name + '/inferreddata/' + represid + '_to_' + sid + '.npy') if xinf.max() > 20: xinf = np.log1p(xinf) semis.append(xinf) ed = timeit.default_timer() print(str(ed-st),'for loading semi-profiled cohort.') print('pca') # pca t_start = timeit.default_timer() X = np.concatenate([np.concatenate(gts,axis=0),np.concatenate(semis,axis=0)],axis=0) #X = np.log(X+1) reducer = PCA(n_components = 100)#PCA(n_components = 100)#,svd_solver = 'randomized')#randomized_svd(n_components=100) #PCA(n_components=100)# X_reduced = reducer.fit_transform(X) t_end = timeit.default_timer() print(str(t_end-t_start),'for pca') ### reduced data xdimgts=[] xdimsemis=[] offset=0 xused = X_reduced#X_UMAP # X_PCA for i in range(len(sids)): xdimgts.append(xused[offset:(offset+gts[i].shape[0]),:]) offset = offset+gts[i].shape[0] lengt = offset for i in range(len(sids)): xdimsemis.append(xused[offset:(offset+semis[i].shape[0]),:]) offset = offset+semis[i].shape[0] # error print('computing errors') st = timeit.default_timer() # lowerbound lbgt = copy.deepcopy(X_reduced) np.random.shuffle(lbgt) lbgt1 = lbgt[:lbgt.shape[0]//2,:] lbgt2 = lbgt[lbgt.shape[0]//2:,:] ma = np.array(lbgt1).copy(order='C') mb = np.array(lbgt2).copy(order='C') lowerbound = list(faiss_knn(ma,mb,n_neighbors=1)) + list(faiss_knn(mb,ma,n_neighbors=1)) lowerbound = np.array(lowerbound) lowerbound = lowerbound.mean() lowerbounds.append(lowerbound) #upperbound ubscores = [] for i in range(len(sids)): gt = xdimgts[i] randomidx = np.random.randint(0,len(sids)) gtr = xdimgts[randomidx] ma = np.array(gt).copy(order='C') mb = np.array(gtr).copy(order='C') ubscore = list(faiss_knn(ma,mb,n_neighbors=1)) + list(faiss_knn(mb,ma,n_neighbors=1)) ubscores.append(np.array(ubscore).mean()) upperbound = np.array(ubscores).mean() upperbounds.append(upperbound) #semi error scores = [] for i in range(len(sids)): pid = sids[i] if i in representatives: scores.append(lowerbound) continue gt = xdimgts[i] xs = xdimsemis[i] ma = np.array(gt).copy(order='C') mb = np.array(xs).copy(order='C') err1 = faiss_knn(ma,mb,n_neighbors=1) err2 = faiss_knn(mb,ma,n_neighbors=1) err = list(err1) + list(err2) err = (np.array(err)).mean() scores.append(err) semierror = np.array(scores).mean() semierrors.append(semierror) #naive error naivescores = [] for i in range(len(sids)): pid = sids[i] if i in representatives: naivescores.append(lowerbound) continue gt = xdimgts[i] repre = representatives[cluster_labels[i]] xs = xdimgts[repre] ma = np.array(gt).copy(order='C') mb = np.array(xs).copy(order='C') err1 = faiss_knn(ma,mb,n_neighbors=1) err2 = faiss_knn(mb,ma,n_neighbors=1) err = list(err1) + list(err2) err = (np.array(err)).mean() naivescores.append(err) naiveerror = np.array(naivescores).mean() naiveerrors.append(naiveerror) ed = timeit.default_timer() print(str(ed-st),'for computing error.') return upperbounds, lowerbounds, semierrors, naiveerrors
[docs]def errorcurve(upperbounds:list, lowerbounds:list, semierrors:list, naiveerrors:list, batch:int=2,total_samples:int = 12) -> None: """ Visualize the error and cost as more representatives are sequenced. Parameters ---------- upperbounds The error upper bounds calculated in each round lowerbounds The error lower bounds calculated in each round semierrors The errors of semi-profiling naiveerrors The errors of the selection-only method batch Representative selection batch size total_samples The total number of samples in the cohort Returns ------- None Example ------- >>> errorcurve(upperbounds, lowerbounds, semierrors, naiveerrors, batch=2,total_samples = 12) """ ub = np.mean(upperbounds) lb = np.mean(lowerbounds) semi = (np.array(semierrors) - lb)/(ub-lb) naive = (np.array(naiveerrors) - lb)/(ub-lb) y1 = naive y2 = semi x = np.linspace(batch,batch*len(y1),len(y1)) cost = [] for i in range(len(y1)): bulkcost = 7000 + total_samples * 110 sccost = 5000 * 0.3 * batch * i semicost = bulkcost + sccost cost.append(semicost/1000) # Create the main plot fig, ax1 = plt.subplots() ax1.plot(x, y1, color='tab:blue', label='Selection-only') ax1.set_xlabel('Number of Samples Sequenced') ax1.set_ylabel('Normalized Error') #ax1.tick_params(axis='y', labelcolor='tab:blue') ax1.plot(x, y2, color='tab:orange', label='Semi-profiling') ax1.set_ylabel('Normalized Error') #ax1.tick_params(axis='y', labelcolor='tab:red') # Create a secondary y-axis ax2 = ax1.twinx() ax2.plot(x, cost, color='tab:red', label='cost') ax2.set_ylabel('Cost (in k USD)') #ax2.tick_params(axis='y', labelcolor='tab:red') # Add legends lines, labels = ax1.get_legend_handles_labels() lines2, labels2 = ax2.get_legend_handles_labels() ax2.legend(lines + lines2, labels + labels2, loc='upper center') plt.title('Error and Cost as More Representatives Sequenced') return
def prop(gttypes ,semitypes, totaltypes): pgt = np.zeros(totaltypes.shape) psemi = np.zeros(totaltypes.shape) for i in range(len(totaltypes)): pgt[i] += (np.array(gttypes) == totaltypes[i]).sum() psemi[i] += (np.array(semitypes) == totaltypes[i]).sum() ## celltypes2 numgt=pgt numsemi=psemi pgt = pgt/pgt.sum() psemi = psemi/psemi.sum() pcor,pval = scipy.stats.pearsonr(pgt,psemi) return pcor,pval,pgt,psemi,numgt,numsemi def RMSE(p1,p2): p1 = np.array(p1) p2 = np.array(p2) mse = ((p1-p2)**2).mean() rmse = mse**0.5 return rmse def CCCscore(y_pred, y_true, mode='all'): # adapated from Yanshuo et.al's work TAPE # pred: shape{n sample, m cell} if mode == 'all': y_pred = y_pred.reshape(-1, 1) y_true = y_true.reshape(-1, 1) elif mode == 'avg': pass ccc_value = 0 for i in range(y_pred.shape[1]): r = np.corrcoef(y_pred[:, i], y_true[:, i])[0, 1] # Mean mean_true = np.mean(y_true[:, i]) mean_pred = np.mean(y_pred[:, i]) # Variance var_true = np.var(y_true[:, i]) var_pred = np.var(y_pred[:, i]) # Standard deviation sd_true = np.std(y_true[:, i]) sd_pred = np.std(y_pred[:, i]) # Calculate CCC numerator = 2 * r * sd_true * sd_pred denominator = var_true + var_pred + (mean_true - mean_pred) ** 2 ccc = numerator / denominator ccc_value += ccc return ccc_value / y_pred.shape[1]
[docs]def inspect_data(bulk:str,logged:bool=False,normed:bool = True, geneselection:Union[bool,int]=True, save=None) -> None: """ To accommodate users who prefer all-in-one-batch profiling, we have developed hits new function. This function allows users to assess data heterogeneity (using Silhouette scores and bulk data visualization) and determine an approximate number of representative clusters needed for their dataset. Once the number of clusters is established, users can sequence all representatives in one batch to minimize potential batch effects. Parameters ---------- name Project name. bulk Path to bulk data as an h5ad file. Sample IDs should be stored in adata.obs['sample_ids'] and gene names should be stored in adata.var.index. logged Whether the data has been logged or not normed Whether the library size has been normalized or not geneselection Either a boolean value indicating whether to perform gene selection using the bulk data or not, or a integer specifying the number of highly variable genes should be selected. save Specify a folder for saving figures Returns ------- None Example -------- >>> import scSemiProfiler as semi >>> from scSemiProfiler.utils import * >>> bulk = 'example_data/bulkdata.h5ad' >>> logged = False >>> normed = True >>> geneselection = False >>> inspect_data(bulk,logged,normed,geneselection) """ print('Presenting dataset heterogeneity information to decide sample batch size') if save != None: if (os.path.isdir(save)) == False: os.system('mkdir '+save) bulkdata = anndata.read_h5ad(bulk) if normed == False: if logged == True: print('Bad data preprocessing. Please normalize the library size before log-transformation.') return sc.pp.normalize_total(bulkdata, target_sum=1e4) if logged == False: sc.pp.log1p(bulkdata) # write sample ids sids = list(bulkdata.obs['sample_ids']) if geneselection == False: hvgenes = np.array(bulkdata.var.index) elif geneselection == True: sc.pp.highly_variable_genes(bulkdata, n_top_genes=6000) #sc.pp.highly_variable_genes(bulkdata, min_mean=0.0125, max_mean=3, min_disp=0.5) bulkdata = bulkdata[:, bulkdata.var.highly_variable] hvgenes = (np.array(bulkdata.var.index))[bulkdata.var.highly_variable] else: sc.pp.highly_variable_genes(bulkdata, n_top_genes = int(geneselection)) #sc.pp.highly_variable_genes(bulkdata, min_mean=0.0125, max_mean=3, min_disp=0.5) bulkdata = bulkdata[:, bulkdata.var.highly_variable] hvgenes = (np.array(bulkdata.var.index))[bulkdata.var.highly_variable] #dim reduction and clustering if bulkdata.X.shape[0]>100: n_comps = 100 else: n_comps = bulkdata.X.shape[0]-1 sc.tl.pca(bulkdata,n_comps=n_comps) sc.pp.neighbors(bulkdata) sc.tl.umap(bulkdata) sc.pl.umap(bulkdata) avgs = [] xaxis = [] for i in range(2,len(sids)): n_clusters = i# max(2,batch*i + batch) xaxis.append(n_clusters) clusterer = KMeans(n_clusters=n_clusters, random_state=i) X = bulkdata.obsm['X_pca'] cluster_labels = clusterer.fit_predict(X) silhouette_avg = silhouette_score(X, cluster_labels) avgs.append(silhouette_avg) plt.xlabel('Number of Clusters') plt.plot(xaxis,avgs) plt.title('Average Silhoette Scores') return
[docs]def global_stop_checking(name:str, representatives:list, n_targets:int = 2, celltype_key:str='celltypes' ) -> Tuple[list,list]: """ Checking if further representative selection is needed for the global mode. n_target samples will be randomly selected from the representatives as the dummy target samples. Other representatives will be used for infer the dummy target samples. A list of inferred samples and the corresponding round truth will be returned for the user to examine the similarity and decide if further representative selection is needed. Information regarding cell type deconvolution performance will also be presented. Parameters ---------- name Project name. representatives List of representative samples. cluster_labels List of cluster labels. n_target The number of dummy targets. celltype_key The key in adata.obs for storing cell types Returns ------- real_target_list List of real-profiled target ground truth inferred_target_list List of inferred target data for comparison Example -------- >>> import scSemiProfiler as semi >>> from scSemiProfiler.utils import * >>> name = 'project_name' >>> representatives = [6, 10, 5, 7] >>> real_target_list, inferred_target_list = global_stop_checking(name = name, representatives =representatives, n_target = 2 ) >>> # downstram analysis comparing real targets and inferred targets """ sids = [] f = open(name + '/sids.txt','r') lines = f.readlines() for l in lines: sids.append(l.strip()) f.close() repsdata = [] for i in range(len(representatives)): r = representatives[i] repsdata.append(anndata.read_h5ad(name + '/sample_sc/'+sids[r]+'.h5ad')) repspseudobulk = [] for adata in repsdata: repspseudobulk.append( adata.X.mean(axis=0) ) repspseudobulk = np.array(repspseudobulk) repspseudobulk = repspseudobulk.reshape((-1, adata.X.shape[1])) repspseudobulk = anndata.AnnData(repspseudobulk) if repspseudobulk.X.max()>20: sc.pp.log1p(repspseudobulk) sc.tl.pca(repspseudobulk) dismtx = repspseudobulk.obsm['X_pca'] real_target_list = [] inferred_target_list = [] targets = random.sample(representatives,2) # find ref refs=[] for i in range(n_targets): t = dismtx[i] dists = ((dismtx - t)**2).sum(axis=1) sorted_indices = np.argsort(dists) index_second_smallest = sorted_indices[1] refs.append(representatives[index_second_smallest]) real_target_list.append(repsdata[index_second_smallest]) # infer for i in range(n_targets): tgt = targets[i] rep = refs[i] inferredtgt = tgtinfer(name = name, representative = rep, target = tgt, device = 'cuda:0') for i in range(n_targets): tgt = targets[i] adata = anndata.read_h5ad(name + '/sample_sc/'+sids[tgt]+'.h5ad') inferred_target_list.append(adata) xtrain = [] ytrain = [] for i in range(len(real_target_list)): adata = real_target_list[i] xtrain.append(np.array(adata.X)) sample_celltype = list(adata.obs[celltype_key]) ytrain = ytrain + sample_celltype xtrain = np.concatenate(xtrain, axis=0) if xtrain.max()>10: xtrain = np.log1p(xtrain) ytrain = np.array(ytrain) totaltypes = np.unique(ytrain) annotator = MLPClassifier(hidden_layer_sizes=(200,)) annotator.fit((xtrain),ytrain) print('Pearson correlation between cell type proportions in inferred and ground truth samples:') for i in range(n_targets): x = inferred_target_list[i].X if x.max()>20: x= np.log1p(x) predicted_celltype = annotator.predict(x) inferred_target_list[i].obs[celltype_key] = predicted_celltype realtypes = celltype_proportion(real_target_list[i],totaltypes) semitypes = celltype_proportion(inferred_target_list[i],totaltypes) print(scipy.stats.pearsonr(realtypes ,semitypes )) return real_target_list, inferred_target_list
def main(): parser=argparse.ArgumentParser(description="scSemiProfiler assemble_cohort") parser._action_groups.pop() required = parser.add_argument_group('required arguments') optional = parser.add_argument_group('optional arguments') required.add_argument('--name',required=True, help="Project name.") required.add_argument('--representatives',required=True, help="Either a txt file including all representatives or a list of representatives.") required.add_argument('--cluster_labels',required=True, help="Either a txt file including cluster label information or a list of labels.") optional.add_argument('--celltype_key',required=False, default='celltypes', help="The key in the adata.obs indicating the cell type labels (Default: 'celltypes')") ### args = parser.parse_args() name = args.name representatives = args.representatives cluster_labels = args.cluster_labels celltype_key = args.celltype_key assemble_cohort(name, representatives, cluster_labels, celltype_key) if __name__=="__main__": main()