Integrating the three slices from the spatial-ATAC-seq ME dataset

This dataset contains one slice from the E11 developmental stage and two slices from the E13 stage, with the sequencing depth of the ME13_1 slice being significantly higher than that of the other two slices, resulting in over 590,000 peaks and significantly higher fragment counts. The other two slices have only about 290,000 peaks, but higher ratio of reads in transcription start sites (TSS) compared to the ME13_1 slice.

[ ]:
import os
import csv
import torch
import numpy as np
import anndata as ad

from sklearn.decomposition import PCA

import INSTINCT

import warnings
warnings.filterwarnings("ignore")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Load the raw data

[ ]:
save = True

data_dir = '../../data/spCASdata/HumanMouse_Deng2022/preprocessed/'
save_dir = '../../results/HumanMouse_Deng2022/'
slice_name_list = ['GSM5238385_ME11_50um', 'GSM5238386_ME13_50um', 'GSM5238387_ME13_50um_2']
slice_used = [0, 1, 2]
slice_name_list = [slice_name_list[i] for i in slice_used]
slice_index_list = list(range(len(slice_name_list)))

save_dir = f'../../results/HumanMouse_Deng2022/{slice_used}/'
if not os.path.exists(data_dir + f'{slice_used}/'):
    os.makedirs(data_dir + f'{slice_used}/')
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# load raw data
cas_list = []
for sample in slice_name_list:
    sample_data = ad.read_h5ad(data_dir + sample + '.h5ad')

    if 'insertion' in sample_data.obsm:
        del sample_data.obsm['insertion']

    cas_list.append(sample_data)

Merge the peaks

[ ]:
# merge peaks
cas_list = INSTINCT.peak_sets_alignment(cas_list)

# save the merged data
for idx, adata in enumerate(cas_list):
    adata.write_h5ad(data_dir + f'{slice_used}/merged_{slice_name_list[idx]}.h5ad')
[ ]:
# load the merged data
cas_list = [ad.read_h5ad(data_dir + f'{slice_used}/merged_{sample}.h5ad') for sample in slice_name_list]
for j in range(len(cas_list)):
    cas_list[j].obs_names = [x + '_' + slice_name_list[j] for x in cas_list[j].obs_names]
    if 'in_tissue' in cas_list[j].obs.keys():
        cas_list[j] = cas_list[j][cas_list[j].obs['in_tissue'] == 1, :]

# concatenation
adata_concat = ad.concat(cas_list, label="slice_name", keys=slice_name_list)
# adata_concat.obs_names_make_unique()
print(adata_concat.shape)

Data preprocessing

[ ]:
# preprocess CAS data
print('Start preprocessing')
INSTINCT.preprocess_CAS(cas_list, adata_concat, use_fragment_count=True, min_cells_rate=0.05)
print(adata_concat.shape)
print('Done!')
[ ]:
adata_concat.write_h5ad(save_dir + f"preprocessed_concat.h5ad")
for i in range(len(slice_name_list)):
    cas_list[i].write_h5ad(save_dir + f"filtered_merged_{slice_name_list[i]}.h5ad")

cas_list = [ad.read_h5ad(save_dir + f"filtered_merged_{sample}.h5ad") for sample in slice_name_list]
adata_concat = ad.read_h5ad(save_dir + f"preprocessed_concat.h5ad")

Perform PCA

[ ]:
print(f'Applying PCA to reduce the feature dimension to 100 ...')
pca = PCA(n_components=100, random_state=1234)
input_matrix = pca.fit_transform(adata_concat.X.toarray())
np.save(save_dir + f'input_matrix.npy', input_matrix)
print('Done !')

input_matrix = np.load(save_dir + f'input_matrix.npy')
adata_concat.obsm['X_pca'] = input_matrix

Create neighbor graph

[ ]:
# calculate the spatial graph
INSTINCT.create_neighbor_graph(cas_list, adata_concat)

Run model

[ ]:
INSTINCT_model = INSTINCT.INSTINCT_Model(cas_list, adata_concat, device=device)

INSTINCT_model.train(report_loss=True, report_interval=100)

INSTINCT_model.eval(cas_list)

result = ad.concat(cas_list, label="slice_idx", keys=slice_index_list)

if save:
    with open(save_dir + f'INSTINCT_embed.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerows(result.obsm['INSTINCT_latent'])