Integrating three slices from spatial ATAC-RNA-seq MB dataset
Utilize INSTINCT to integrate three samples of mouse postnatal day 21/22 (P21/22) brains generated by spatial ATAC-RNA-seq (spatial ATAC-RNA-seq MB), a multi-omics sequencing technique.
It is worth noting that although these three slices were sequenced from similar developmental stages of the same organ, the first two slices (slice 0 and 1) have a size of barcodes, while the third slice (slice 2) contains barcodes.
This resulted in the sequencing of brain tissues of different scales, with slice 2 essentially encompassing the entire hemisphere of the coronal brain, while slice 0 and 1 only contained approximately one-quarter of the size.
[ ]:
import os
import anndata as ad
import numpy as np
import torch
import csv
from sklearn.decomposition import PCA
import INSTINCT
import warnings
warnings.filterwarnings("ignore")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
Load the raw data
[ ]:
slice_name_list = ["GSM6204623_MouseBrain_20um", "GSM6758284_MouseBrain_20um_repATAC", "GSM6758285_MouseBrain_20um_100barcodes_ATAC"]
rna_slice_name_list = ["GSM6204636_MouseBrain_20um", "GSM6753041_MouseBrain_20um_repATAC", "GSM6753043_MouseBrain_20um_100barcodes_ATAC"]
slice_index_list = list(range(len(slice_name_list)))
data_dir = '../../data/spMOdata/EpiTran_HumanMouse_Zhang2023/preprocessed_from_fragments/'
save_dir = f'../../results/HumanMouse_Zhang2023/mb/'
if not os.path.exists(data_dir + f'mb_merged/'):
os.makedirs(data_dir + f'mb_merged/')
if not os.path.exists(save_dir):
os.makedirs(save_dir)
# load raw data
cas_list = []
for sample in slice_name_list:
sample_data = ad.read_h5ad(data_dir + sample + '.h5ad')
if 'insertion' in sample_data.obsm:
del sample_data.obsm['insertion']
cas_list.append(sample_data)
Merge the peaks
[ ]:
cas_list = INSTINCT.peak_sets_alignment(cas_list)
# save the merged data
for idx, adata in enumerate(cas_list):
adata.write_h5ad(data_dir + f'mb_merged/merged_{slice_name_list[idx]}.h5ad')
[ ]:
# load the merged data
cas_list = [ad.read_h5ad(data_dir + f'mb_merged/merged_{sample}.h5ad') for sample in slice_name_list]
for j in range(len(cas_list)):
cas_list[j].obs_names = [x + '_' + slice_name_list[j] for x in cas_list[j].obs_names]
# read the raw RNA data
rna_list = [ad.read_h5ad(data_dir + f'{sample}.h5ad') for sample in rna_slice_name_list]
for j in range(len(rna_list)):
rna_list[j].obs_names = [x + '-1_' + slice_name_list[j] for x in rna_list[j].obs_names]
print(rna_list[j].shape)
# filter spots that is not tissue
for i in range(len(slice_name_list)):
obs_list = [obs_name for obs_name in cas_list[i].obs_names if obs_name in rna_list[i].obs_names]
cas_list[i] = cas_list[i][obs_list, :]
print(cas_list[i].shape)
# concatenation
adata_concat = ad.concat(cas_list, label="slice_name", keys=slice_name_list)
# adata_concat.obs_names_make_unique()
print(adata_concat.shape)
Data preprocessing
[ ]:
# preprocess CAS data
print('Start preprocessing')
INSTINCT.preprocess_CAS(cas_list, adata_concat, use_fragment_count=True, min_cells_rate=0.02)
print(adata_concat.shape)
print('Done!')
[ ]:
adata_concat.write_h5ad(save_dir + f"preprocessed_concat.h5ad")
for i in range(len(slice_name_list)):
cas_list[i].write_h5ad(save_dir + f"filtered_merged_{slice_name_list[i]}.h5ad")
cas_list = [ad.read_h5ad(save_dir + f"filtered_merged_{sample}.h5ad") for sample in slice_name_list]
# origin_concat = ad.concat(cas_list, label="slice_idx", keys=slice_index_list)
adata_concat = ad.read_h5ad(save_dir + f"preprocessed_concat.h5ad")
Perform PCA
[ ]:
print(f'Applying PCA to reduce the feature dimension to 100 ...')
pca = PCA(n_components=100, random_state=1234)
input_matrix = pca.fit_transform(adata_concat.X.toarray())
np.save(save_dir + 'input_matrix.npy', input_matrix)
print('Done !')
input_matrix = np.load(save_dir + 'input_matrix.npy')
adata_concat.obsm['X_pca'] = input_matrix
Create neighbor graph
[ ]:
# calculate the spatial graph
INSTINCT.create_neighbor_graph(cas_list, adata_concat)
Data integration
[ ]:
INSTINCT_model = INSTINCT.INSTINCT_Model(cas_list, adata_concat, device=device)
INSTINCT_model.train(report_loss=True, report_interval=100)
INSTINCT_model.eval(cas_list)
[ ]:
result = ad.concat(cas_list, label="slice_idx", keys=slice_index_list)
with open(save_dir + 'INSTINCT_embed.csv', 'w', newline='') as file:
writer = csv.writer(file)
writer.writerows(result.obsm['INSTINCT_latent'])
with open(save_dir + 'INSTINCT_noise_embed.csv', 'w', newline='') as file:
writer = csv.writer(file)
writer.writerows(result.obsm['INSTINCT_latent_noise'])