Continuous gene expression gradients in the motor cortex

import os
import numpy as np
import matplotlib.pyplot as plt
from glmpca import glmpca

from importlib import reload

import gaston
from gaston import neural_net,cluster_plotting, dp_related, segmented_fit, model_selection
from gaston import binning_and_plotting, isodepth_scaling, run_slurm_scripts
from gaston import spatial_gene_classification, plot_cell_types, filter_genes, process_NN_output

Step 1: Pre-processing

GASTON requires:

N x G counts matrix
N x 2 spatial coordinate matrix,
list of names for each gene

where N=number of spatial locations and G=number of genes

!mkdir -p motor_cortex_tutorial_outputs

coords_mat=np.load('motor_cortex_data/coords_mat.npy') # N x 2 spatial coordinate matrix 
counts_mat=np.load('motor_cortex_data/counts_mat.npy') # N x G UMI count array
gene_labels=np.load('motor_cortex_data/gene_labels.npy', allow_pickle=True) # array of names of G genes

num_dims=8 # 2 * number of clusters
penalty=10 # may need to increase if this is too small

glmpca_res=glmpca.glmpca(counts_mat.T, num_dims, fam="poi", penalty=penalty, verbose=True)
A = glmpca_res['factors'] # should be of size N x num_dims, where each column is a PC

np.save('motor_cortex_data/glmpca.npy', A)

Iteration: 0 | deviance=7.7347E+5
Iteration: 1 | deviance=7.7347E+5
Iteration: 2 | deviance=6.2336E+5
Iteration: 3 | deviance=4.1521E+5
Iteration: 4 | deviance=3.6895E+5
Iteration: 5 | deviance=3.5008E+5
Iteration: 6 | deviance=3.3981E+5
Iteration: 7 | deviance=3.3384E+5
Iteration: 8 | deviance=3.3011E+5
Iteration: 9 | deviance=3.2758E+5
Iteration: 10 | deviance=3.2573E+5
Iteration: 11 | deviance=3.2431E+5
Iteration: 12 | deviance=3.2318E+5
Iteration: 13 | deviance=3.2227E+5
Iteration: 14 | deviance=3.2151E+5
Iteration: 15 | deviance=3.2088E+5
Iteration: 16 | deviance=3.2035E+5
Iteration: 17 | deviance=3.1990E+5
Iteration: 18 | deviance=3.1951E+5
Iteration: 19 | deviance=3.1918E+5
Iteration: 20 | deviance=3.1888E+5
Iteration: 21 | deviance=3.1862E+5
Iteration: 22 | deviance=3.1839E+5
Iteration: 23 | deviance=3.1818E+5
Iteration: 24 | deviance=3.1799E+5
Iteration: 25 | deviance=3.1782E+5
Iteration: 26 | deviance=3.1766E+5
Iteration: 27 | deviance=3.1751E+5
Iteration: 28 | deviance=3.1737E+5
Iteration: 29 | deviance=3.1725E+5
Iteration: 30 | deviance=3.1713E+5
Iteration: 31 | deviance=3.1702E+5
Iteration: 32 | deviance=3.1691E+5
Iteration: 33 | deviance=3.1682E+5
Iteration: 34 | deviance=3.1673E+5
Iteration: 35 | deviance=3.1664E+5
Iteration: 36 | deviance=3.1656E+5
Iteration: 37 | deviance=3.1648E+5
Iteration: 38 | deviance=3.1641E+5
Iteration: 39 | deviance=3.1634E+5
Iteration: 40 | deviance=3.1628E+5
Iteration: 41 | deviance=3.1622E+5
Iteration: 42 | deviance=3.1616E+5
Iteration: 43 | deviance=3.1610E+5
Iteration: 44 | deviance=3.1605E+5
Iteration: 45 | deviance=3.1600E+5
Iteration: 46 | deviance=3.1595E+5
Iteration: 47 | deviance=3.1590E+5
Iteration: 48 | deviance=3.1586E+5
Iteration: 49 | deviance=3.1581E+5
Iteration: 50 | deviance=3.1577E+5
Iteration: 51 | deviance=3.1573E+5
Iteration: 52 | deviance=3.1569E+5
Iteration: 53 | deviance=3.1565E+5
Iteration: 54 | deviance=3.1562E+5
Iteration: 55 | deviance=3.1558E+5
Iteration: 56 | deviance=3.1555E+5
Iteration: 57 | deviance=3.1552E+5

# visualize top GLM-PCs
R=2
C=4
fig,axs=plt.subplots(R,C,figsize=(20,10))
for r in range(R):
    for c in range(C):
        i=r*C+c
        axs[r,c].scatter(coords_mat[:,0], coords_mat[:,1], c=A[:,i],cmap='Reds',s=4)
        axs[r,c].set_title(f'GLM-PC{i}')

../../_images/0de584e74e4920c4c0ed8658b4fa87189d063869f5dcfe6d137c467f777d9fd2.png

Step 2: Train GASTON neural network

We include how to train the neural network with two options: (1) a command line script and (2) in a notebook. We typically train the neural network 30 different times, each with a different seed, and we use the NN with lowest loss.

Option 1: Slurm

For option (1), the code below creates 30 different Slurm jobs, one for each initialization. To train the NN for a single initialization run the following command:

gaston -i /path/to/coords.npy -o /path/to/glmpca.npy -d /path/to/output_dir -e 10000 -c 500 -p 20 20 -x 20 20 -z adam -s SEED

for a given SEED value (integer)

NOTE: please wait for models to finish training before running below code. You can check on their status by running squeue -u uchitra (replacing uchitra with your username)

# LOAD DATA generated above
# path_to_glmpca='colorectal_tumor_data/glmpca.npy'
# path_to_coords='colorectal_tumor_data/coords_mat.npy'

# To approximately recreate paper figures, use same GLM-PCs from paper 
path_to_glmpca='motor_cortex_data/glmpca_from_paper.npy'
path_to_coords='motor_cortex_data/coords_mat.npy'

# GASTON NN parameters
isodepth_arch=[20,20] # architecture (two hidden layers of size 20) for isodepth neural network d(x,y)
expression_arch=[20,20] # architecture (two hidden layers of size 20) for 1-D expression function
epochs = 10000 # number of epochs to train NN
checkpoint = 500 # save model after number of epochs = multiple of checkpoint
optimizer = "adam"
num_restarts=30 # number of initializations

output_dir='motor_cortex_tutorial_outputs' # folder to save model runs

# REPLACE with your own conda environment name and path
conda_environment='gaston-package'
path_to_conda_folder='/n/fs/ragr-data/users/uchitra/miniconda3/bin/activate'

run_slurm_scripts.train_NN_parallel(path_to_coords, path_to_glmpca, isodepth_arch, expression_arch, 
                      output_dir, conda_environment, path_to_conda_folder,
                      epochs=epochs, checkpoint=checkpoint, 
                      num_seeds=num_restarts)

jobId: 20487119
jobId: 20487120
jobId: 20487121
jobId: 20487122
jobId: 20487123
jobId: 20487124
jobId: 20487125
jobId: 20487126
jobId: 20487127
jobId: 20487128
jobId: 20487129
jobId: 20487130
jobId: 20487131
jobId: 20487132
jobId: 20487133
jobId: 20487134
jobId: 20487135
jobId: 20487136
jobId: 20487137
jobId: 20487138
jobId: 20487139
jobId: 20487140
jobId: 20487141
jobId: 20487142
jobId: 20487143
jobId: 20487144
jobId: 20487145
jobId: 20487146
jobId: 20487147
jobId: 20487148

Option 2: train in notebook

path_to_glmpca='motor_cortex_data/glmpca_from_paper.npy'
path_to_coords='motor_cortex_data/coords_mat.npy'

A=np.load(path_to_glmpca) # GLM-PCA results used in manuscript
S=np.load(path_to_coords)

# z-score normalize S and A
S_torch, A_torch = neural_net.load_rescale_input_data(S,A)

######################################
# NEURAL NET PARAMETERS (USER CAN CHANGE)
# architectures are encoded as list, eg [20,20] means two hidden layers of size 20 hidden neurons
isodepth_arch=[20,20] # architecture for isodepth neural network d(x,y) : R^2 -> R 
expression_fn_arch=[20,20] # architecture for 1-D expression function h(w) : R -> R^G

num_epochs = 10000 # number of epochs to train NN (NOTE: it is sometimes beneficial to train longer)
checkpoint = 500 # save model after number of epochs = multiple of checkpoint
out_dir='motor_cortex_tutorial_outputs' # folder to save model runs
optimizer = "adam"
num_restarts=30
device='cuda' # change to 'cpu' if you don't have a GPU

######################################

seed_list=range(num_restarts)
for seed in seed_list:
    print(f'training neural network for seed {seed}')
    out_dir_seed=f"{out_dir}/rep{seed}"
    os.makedirs(out_dir_seed, exist_ok=True)
    mod, loss_list = neural_net.train(S_torch, A_torch,
                          S_hidden_list=isodepth_arch, A_hidden_list=expression_fn_arch, 
                          epochs=num_epochs, checkpoint=checkpoint, device=device,
                          save_dir=out_dir_seed, optim=optimizer, seed=seed, save_final=True)

Step 3: Process neural network output

We use the model trained for the paper for reproducibility. If you use the model trained above, then figures will closely match the manuscript — but not exactly match — due to PyTorch non-determinism in seeding (see https://github.com/pytorch/pytorch/issues/7068 ).

Visualize isodepth (topographic map) and spatial domains

Load best model

#gaston_model, A, S=process_NN_output.process_files('motor_cortex_tutorial_outputs')  # model trained above
gaston_model, A, S= process_NN_output.process_files('motor_cortex_data/reproduce_motor_cortex') # MATCH PAPER FIGURES

coords_mat=np.load('motor_cortex_data/coords_mat.npy') # N x 2 spatial coordinate matrix 
counts_mat=np.load('motor_cortex_data/counts_mat.npy') # N x G UMI count array
gene_labels=np.load('motor_cortex_data/gene_labels.npy', allow_pickle=True) # array of names of G genes

best model: motor_cortex_data/reproduce_motor_cortex/seed15

model_selection.plot_ll_curve(gaston_model, A, S, max_domain_num=8, start_from=2)

Kneedle number of domains: 4

../../_images/2b9aa2bf341eb708d62594e52bbd870f0c1d4e2341d3d80ce37603d3bfcc1431.png

Compute isodepth and GASTON domains

num_layers=4 # CHANGE FOR YOUR APPLICATION: use number of layers from above!
gaston_isodepth, gaston_labels=dp_related.get_isodepth_labels(gaston_model,A,S,num_layers)

gaston_isodepth=isodepth_scaling.adjust_isodepth(gaston_isodepth, gaston_labels, coords_mat, 
                                 q_vals=[0.1, 0.15, 0.2, 0.2], scale_factor=0.05)

Plot topographic map: isodepth and spatial gradients

show_streamlines=False
cluster_plotting.plot_isodepth(gaston_isodepth, S, gaston_model, figsize=(7,5), streamlines=show_streamlines, streamlines_lw=0.3, cmap="Reds")

../../_images/03b4bb58eb2344b78299e0cb7d5cde3a8b50a1c2c4f7e188c0c7120f425cef2f.png

Plot GASTON domains

domain_colors=['lightcoral', 'deepskyblue', 'darkorange', 'yellowgreen']
cluster_plotting.plot_clusters(gaston_labels, S, figsize=(5.5,5), 
                               colors=domain_colors, s=20, lgd=False, 
                               show_boundary=True, gaston_isodepth=gaston_isodepth, boundary_lw=3)

../../_images/35ef64bfaa9c53731a7666d41e03cdf5121b4b07baa23e877c89af68c28b4781.png

Continuous gradient analysis

Compute piecewise linear fit for every gene

umi_threshold = 0
pw_fit_dict=segmented_fit.pw_linear_fit(counts_mat, gaston_labels, gaston_isodepth,
    None, [], umi_threshold = umi_threshold)

Poisson regression for ALL cell types

100%|██████████| 242/242 [00:06<00:00, 35.41it/s]

binning_output=binning_and_plotting.bin_data(counts_mat, gaston_labels, gaston_isodepth, 
    None, gene_labels, num_bins_per_domain=[7,7,5,6], umi_threshold=umi_threshold, remove_unused_bins=True, min_cells_per_bin=8)

Find discontinuous and continuous genes

q=0.9 # use 0.9 quantile for slopes, discontinuities
discont_genes_layer=spatial_gene_classification.get_discont_genes(pw_fit_dict, binning_output,q=q)
cont_genes_layer=spatial_gene_classification.get_cont_genes(pw_fit_dict, binning_output,q=q)

Visualize piecewise linear fits

gene_name='Wipf3'
print(f'gene {gene_name}: discontinuous after domain(s) {discont_genes_layer[gene_name]}') 
print(f'gene {gene_name}: continuous in domain(s) {cont_genes_layer[gene_name]}')

# display log CPM (if you want to do CP500, set offset=500)
offset=10**6

binning_and_plotting.plot_gene_pwlinear(gene_name, pw_fit_dict, gaston_labels, gaston_isodepth, 
                                        binning_output, cell_type_list=None, pt_size=70, colors=domain_colors, 
                                        linear_fit=True, ticksize=15, figsize=(7,2.5), offset=offset, lw=2,
                                       domain_boundary_plotting=True)

gene Wipf3: discontinuous after domain(s) []
gene Wipf3: continuous in domain(s) []

../../_images/cf6b65a0f7adcebc23beb3d4fa2b8277bedd4f86df0a9f7d99b1aa524afdc966.png

Visualize raw expression values and piecewise linear function learned by GASTON

binning_and_plotting.plot_gene_raw(gene_name, gene_labels, counts_mat, S, vmin=8.5, figsize=(6,4))
plt.title(f'{gene_name} Raw Expression')
binning_and_plotting.plot_gene_function(gene_name, S, pw_fit_dict, gaston_labels, gaston_isodepth, binning_output, figsize=(6,4))
plt.title(f'{gene_name} GASTON Function')
plt.show()

../../_images/0db34a5d80dd33173e96aa9c9a774cbc4027e53fa3c22450ac0262309587396a.png

../../_images/6cae745cbca0154e416fa12862a18fd5204cb26b99fd4c23d48b643855c26e76.png

gene_name='Camk2d'
print(f'gene {gene_name}: discontinuous after domain(s) {discont_genes_layer[gene_name]}') 
print(f'gene {gene_name}: continuous in domain(s) {cont_genes_layer[gene_name]}')

# display log CPM (if you want to do CP500, set offset=500)
offset=10**6

binning_and_plotting.plot_gene_pwlinear(gene_name, pw_fit_dict, gaston_labels, gaston_isodepth, 
                                        binning_output, cell_type_list=None, pt_size=70, colors=domain_colors, 
                                        linear_fit=True, ticksize=15, figsize=(7,2.5), offset=offset, lw=2,
                                       domain_boundary_plotting=True)

gene Camk2d: discontinuous after domain(s) [0]
gene Camk2d: continuous in domain(s) [1]

../../_images/e64a16f6c218458d015fdeae558c246453ebfbd16b2dd09ff5bcc98f5092d358.png

binning_and_plotting.plot_gene_raw(gene_name, gene_labels, counts_mat, S, vmin=7.5, figsize=(6,4))
plt.title(f'{gene_name} Raw Expression')
binning_and_plotting.plot_gene_function(gene_name, S, pw_fit_dict, gaston_labels, gaston_isodepth, binning_output, figsize=(6,4))
plt.title(f'{gene_name} GASTON Function')
plt.show()

../../_images/c175eea9d17d0710504ab05ab50b5450ebff0ff6788fa8637d850af0bfc18eba.png

../../_images/f5691d53ac4e281dedcdb4b0a4977a6c21f62e5c79a70b44bb0a74c9f4b38c4f.png

gene_name='Marcksl1'
print(f'gene {gene_name}: discontinuous after domain(s) {discont_genes_layer[gene_name]}') 
print(f'gene {gene_name}: continuous in domain(s) {cont_genes_layer[gene_name]}')

# display log CPM (if you want to do CP500, set offset=500)
offset=10**6

binning_and_plotting.plot_gene_pwlinear(gene_name, pw_fit_dict, gaston_labels, gaston_isodepth, 
                                        binning_output, cell_type_list=None, pt_size=70, colors=domain_colors, 
                                        linear_fit=True, ticksize=15, figsize=(7,3), offset=offset, lw=2,
                                       domain_boundary_plotting=True)

gene Marcksl1: discontinuous after domain(s) []
gene Marcksl1: continuous in domain(s) [1, 2]

../../_images/ed5d8fbbfdbeed773a8f23a31d5816ecf66a8d5c06d606c3802a31f1593a5dba.png

binning_and_plotting.plot_gene_raw(gene_name, gene_labels, counts_mat, S, vmin=5, figsize=(6,4))
plt.title(f'{gene_name} Raw Expression')
binning_and_plotting.plot_gene_function(gene_name, S, pw_fit_dict, gaston_labels, gaston_isodepth, binning_output, figsize=(6,4))
plt.title(f'{gene_name} GASTON Function')
plt.show()

../../_images/6315ab8156f5d63fd918edb6777e27dc6a10b21c25c95fcbc330fe618b291caa.png

../../_images/dff2bab56c1cd52d5e23922ace0fe8a9ada71e027b709229c7bc22c5573f4a4e.png