Source code for scSemiProfiler.initial_setup

import pdb,sys,os
import anndata
import scanpy as sc
import argparse
import copy
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from typing import Union

import matplotlib.pyplot as plt



[docs]def initsetup(name:str, bulk:str,logged:bool=False,normed:bool = True, geneselection:Union[bool,int]=True,batch:int=4) -> None:
    """
    Initial setup of the semi-profiling pipeline, including processing the bulk data, clustering for finding the initial representatives. Bulk data should be provided as an 'h5ad' file. Sample IDs should be stored in adata.obs['sample_ids'] and gene names should be stored in adata.var.index. If not using active learning for iterative representative selection, directly set the batch size to be the total number of representatives desired.
    
    Parameters
    ----------
    name
        Project name. 
    bulk
        Path to bulk data as an h5ad file. Sample IDs should be stored in adata.obs['sample_ids'] and gene names should be stored in adata.var.index. 
    logged
        Whether the data has been logged or not
    normed
        Whether the library size has been normalized or not
    geneselection
        Either a boolean value indicating whether to perform gene selection using the bulk data or not, or a integer specifying the number of highly variable genes should be selected.
    batch 
        Representative selection batch size. 
    
    Returns
    -------
        None
    
    Example
    --------
    >>> import scSemiProfiler
    >>> name = 'runexample'
    >>> bulk = 'example_data/bulkdata.h5ad'
    >>> logged = False
    >>> normed = True
    >>> geneselection = False
    >>> batch = 2
    >>> scSemiProfiler.initsetup(name, bulk,logged,normed,geneselection,batch)

    """
    
    print('Start initial setup')
    
    if (os.path.isdir(name)) == False:
        os.system('mkdir '+name)
    else:
        print(name + ' exists. Please choose another name.')
        return
    
    if (os.path.isdir(name+'/figures')) == False:
        os.system('mkdir '+name+'/figures')
    
    bulkdata = anndata.read_h5ad(bulk)
    
    
    if normed == False:
        if logged == True:
            print('Bad data preprocessing. Please normalize the library size before log-transformation.')
            return
        sc.pp.normalize_total(bulkdata, target_sum=1e4)
    
    if logged == False:
        sc.pp.log1p(bulkdata)
        
    # write sample ids
    sids = list(bulkdata.obs['sample_ids'])
    f = open(name+'/sids.txt','w')
    for sid in sids:
        f.write(sid+'\n')
    f.close()
    
    
    if geneselection == False:
        hvgenes = np.array(bulkdata.var.index)
    elif geneselection == True:
        sc.pp.highly_variable_genes(bulkdata, n_top_genes=6000)
        #sc.pp.highly_variable_genes(bulkdata, min_mean=0.0125, max_mean=3, min_disp=0.5)
        bulkdata = bulkdata[:, bulkdata.var.highly_variable]
        hvgenes = (np.array(bulkdata.var.index))[bulkdata.var.highly_variable]
    else:
        sc.pp.highly_variable_genes(bulkdata, n_top_genes = int(geneselection))
        #sc.pp.highly_variable_genes(bulkdata, min_mean=0.0125, max_mean=3, min_disp=0.5)
        bulkdata = bulkdata[:, bulkdata.var.highly_variable]
        hvgenes = (np.array(bulkdata.var.index))[bulkdata.var.highly_variable]
    np.save(name+'/hvgenes.npy',hvgenes)
    
    #dim reduction and clustering
    
    if bulkdata.X.shape[0]>100:
        n_comps = 100
    else:
        n_comps = bulkdata.X.shape[0]-1
    
    sc.tl.pca(bulkdata,n_comps=n_comps)
    
    bulkdata.write(name + '/processed_bulkdata.h5ad')
    
    #cluster
    BATCH_SIZE = batch
    kmeans = KMeans(n_clusters=BATCH_SIZE, random_state=1).fit(bulkdata.obsm['X_pca'])
    cluster_labels = kmeans.labels_
    #find representatives and cluster labels
    pnums = []
    for i in range(len(bulkdata.X)):
        pnums.append(i)
    pnums=np.array(pnums)
    centers=[]
    representatives=[]
    repredic={}
    for i in range(len(np.unique(cluster_labels))):
        mask = (cluster_labels==i)
        cluster = bulkdata.obsm['X_pca'][mask]
        cluster_patients = pnums[mask]
        center = cluster.mean(axis=0)
        centers.append(center)
        # find the closest patient
        sqdist = ((cluster - center)**2).sum(axis=1)
        cluster_representative = cluster_patients[np.argmin(sqdist)]
        representatives.append(cluster_representative)
        repredic[i] = cluster_representative
    centers = np.array(centers)
    #store representatives cluster labels
    if (os.path.isdir(name + '/status')) == False:
        os.system('mkdir ' + name + '/status')
    

    f=open(name + '/status/init_cluster_labels.txt','w')
    for i in range(len(cluster_labels)):
        f.write(str(cluster_labels[i])+'\n')
    f.close()

    f=open(name + '/status/init_representatives.txt','w')
    for i in range(len(representatives)):
        f.write(str(representatives[i])+'\n')
    f.close()
    
    print('Initial setup finished. Among ' + str(len(sids)) + ' total samples, selected '+str(batch)+' representatives:')
    for i in range(batch):
        print(sids[representatives[i]])
    
    return





def main():
    parser=argparse.ArgumentParser(description="scSemiProfiler initsetup")
    parser._action_groups.pop()
    required = parser.add_argument_group('required arguments')
    optional = parser.add_argument_group('optional arguments')
    
    required.add_argument('--bulk',required=True,help="Input bulk data as a h5ad file. Sample IDs should be stored in obs.['sample_ids']. Gene symbols should be stored in var.index.")
    
    required.add_argument('--name',required=True, help="Project name.")
    
    optional.add_argument('--normed',required=False, default='no', help="Whether the library size normalization has already been done (Default: no)") ###
    
    optional.add_argument('--geneselection',required=False,default='yes', help="Whether to perform highly variable gene selection: 'yes' or 'no'. (Default: yes)")
    
    optional.add_argument('--batch',required=False, default=4, help="The representative sample batch size (Default: 4)")
    
    args = parser.parse_args()
    bulk = args.bulk
    name = args.name
    geneselection = args.geneselection
    normed = args.normed
    batch = int(args.batch)
    
    initsetup(name,bulk,normed,geneselection,batch)

if __name__=="__main__":
    main()