import pdb,sys,os
import anndata
import scanpy as sc
import argparse
import copy
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from typing import Union
import matplotlib.pyplot as plt
[docs]def initsetup(name:str, bulk:str,logged:bool=False,normed:bool = True, geneselection:Union[bool,int]=True,batch:int=4) -> None:
"""
Initial setup of the semi-profiling pipeline, including processing the bulk data, clustering for finding the initial representatives. Bulk data should be provided as an 'h5ad' file. Sample IDs should be stored in adata.obs['sample_ids'] and gene names should be stored in adata.var.index. If not using active learning for iterative representative selection, directly set the batch size to be the total number of representatives desired.
Parameters
----------
name
Project name.
bulk
Path to bulk data as an h5ad file. Sample IDs should be stored in adata.obs['sample_ids'] and gene names should be stored in adata.var.index.
logged
Whether the data has been logged or not
normed
Whether the library size has been normalized or not
geneselection
Either a boolean value indicating whether to perform gene selection using the bulk data or not, or a integer specifying the number of highly variable genes should be selected.
batch
Representative selection batch size.
Returns
-------
None
Example
--------
>>> import scSemiProfiler
>>> name = 'runexample'
>>> bulk = 'example_data/bulkdata.h5ad'
>>> logged = False
>>> normed = True
>>> geneselection = False
>>> batch = 2
>>> scSemiProfiler.initsetup(name, bulk,logged,normed,geneselection,batch)
"""
print('Start initial setup')
if (os.path.isdir(name)) == False:
os.system('mkdir '+name)
else:
print(name + ' exists. Please choose another name.')
return
if (os.path.isdir(name+'/figures')) == False:
os.system('mkdir '+name+'/figures')
bulkdata = anndata.read_h5ad(bulk)
if normed == False:
if logged == True:
print('Bad data preprocessing. Please normalize the library size before log-transformation.')
return
sc.pp.normalize_total(bulkdata, target_sum=1e4)
if logged == False:
sc.pp.log1p(bulkdata)
# write sample ids
sids = list(bulkdata.obs['sample_ids'])
f = open(name+'/sids.txt','w')
for sid in sids:
f.write(sid+'\n')
f.close()
if geneselection == False:
hvgenes = np.array(bulkdata.var.index)
elif geneselection == True:
sc.pp.highly_variable_genes(bulkdata, n_top_genes=6000)
#sc.pp.highly_variable_genes(bulkdata, min_mean=0.0125, max_mean=3, min_disp=0.5)
bulkdata = bulkdata[:, bulkdata.var.highly_variable]
hvgenes = (np.array(bulkdata.var.index))[bulkdata.var.highly_variable]
else:
sc.pp.highly_variable_genes(bulkdata, n_top_genes = int(geneselection))
#sc.pp.highly_variable_genes(bulkdata, min_mean=0.0125, max_mean=3, min_disp=0.5)
bulkdata = bulkdata[:, bulkdata.var.highly_variable]
hvgenes = (np.array(bulkdata.var.index))[bulkdata.var.highly_variable]
np.save(name+'/hvgenes.npy',hvgenes)
#dim reduction and clustering
if bulkdata.X.shape[0]>100:
n_comps = 100
else:
n_comps = bulkdata.X.shape[0]-1
sc.tl.pca(bulkdata,n_comps=n_comps)
bulkdata.write(name + '/processed_bulkdata.h5ad')
#cluster
BATCH_SIZE = batch
kmeans = KMeans(n_clusters=BATCH_SIZE, random_state=1).fit(bulkdata.obsm['X_pca'])
cluster_labels = kmeans.labels_
#find representatives and cluster labels
pnums = []
for i in range(len(bulkdata.X)):
pnums.append(i)
pnums=np.array(pnums)
centers=[]
representatives=[]
repredic={}
for i in range(len(np.unique(cluster_labels))):
mask = (cluster_labels==i)
cluster = bulkdata.obsm['X_pca'][mask]
cluster_patients = pnums[mask]
center = cluster.mean(axis=0)
centers.append(center)
# find the closest patient
sqdist = ((cluster - center)**2).sum(axis=1)
cluster_representative = cluster_patients[np.argmin(sqdist)]
representatives.append(cluster_representative)
repredic[i] = cluster_representative
centers = np.array(centers)
#store representatives cluster labels
if (os.path.isdir(name + '/status')) == False:
os.system('mkdir ' + name + '/status')
f=open(name + '/status/init_cluster_labels.txt','w')
for i in range(len(cluster_labels)):
f.write(str(cluster_labels[i])+'\n')
f.close()
f=open(name + '/status/init_representatives.txt','w')
for i in range(len(representatives)):
f.write(str(representatives[i])+'\n')
f.close()
print('Initial setup finished. Among ' + str(len(sids)) + ' total samples, selected '+str(batch)+' representatives:')
for i in range(batch):
print(sids[representatives[i]])
return
def main():
parser=argparse.ArgumentParser(description="scSemiProfiler initsetup")
parser._action_groups.pop()
required = parser.add_argument_group('required arguments')
optional = parser.add_argument_group('optional arguments')
required.add_argument('--bulk',required=True,help="Input bulk data as a h5ad file. Sample IDs should be stored in obs.['sample_ids']. Gene symbols should be stored in var.index.")
required.add_argument('--name',required=True, help="Project name.")
optional.add_argument('--normed',required=False, default='no', help="Whether the library size normalization has already been done (Default: no)") ###
optional.add_argument('--geneselection',required=False,default='yes', help="Whether to perform highly variable gene selection: 'yes' or 'no'. (Default: yes)")
optional.add_argument('--batch',required=False, default=4, help="The representative sample batch size (Default: 4)")
args = parser.parse_args()
bulk = args.bulk
name = args.name
geneselection = args.geneselection
normed = args.normed
batch = int(args.batch)
initsetup(name,bulk,normed,geneselection,batch)
if __name__=="__main__":
main()