{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "31ddc1c1-e45b-454f-a564-7f1e6fe05207",
   "metadata": {},
   "source": [
    "# Integrating three slices from spatial ATAC-RNA-seq MB dataset\n",
    "Utilize INSTINCT to integrate three samples of mouse postnatal day 21/22 (P21/22) brains generated by spatial ATAC-RNA-seq (spatial ATAC-RNA-seq MB), a multi-omics sequencing technique. \n",
    "It is worth noting that although these three slices were sequenced from similar developmental stages of the same organ, the first two slices (slice 0 and 1) have a size of $50\\times50$ barcodes, while the third slice (slice 2) contains $100\\times100$ barcodes. \n",
    "This resulted in the sequencing of brain tissues of different scales, with slice 2 essentially encompassing the entire hemisphere of the coronal brain, while slice 0 and 1 only contained approximately one-quarter of the size. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e6c5691-770b-466c-b550-15be82217191",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import anndata as ad\n",
    "import numpy as np\n",
    "import torch\n",
    "import csv\n",
    "\n",
    "from sklearn.decomposition import PCA\n",
    "import INSTINCT\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8a918c30-3d27-4506-a50a-b584aa0b6f6e",
   "metadata": {},
   "source": [
    "### Load the raw data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d46df7a8-e2bd-4a3e-a9d9-354f2469f30e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ATAC (CAS) and RNA file stems; position k in both lists refers to the same slice\n",
    "slice_name_list = [\"GSM6204623_MouseBrain_20um\", \"GSM6758284_MouseBrain_20um_repATAC\", \"GSM6758285_MouseBrain_20um_100barcodes_ATAC\"]\n",
    "rna_slice_name_list = [\"GSM6204636_MouseBrain_20um\", \"GSM6753041_MouseBrain_20um_repATAC\", \"GSM6753043_MouseBrain_20um_100barcodes_ATAC\"]\n",
    "slice_index_list = list(range(len(slice_name_list)))\n",
    "\n",
    "data_dir = '../../data/spMOdata/EpiTran_HumanMouse_Zhang2023/preprocessed_from_fragments/'\n",
    "save_dir = '../../results/HumanMouse_Zhang2023/mb/'\n",
    "merged_dir = data_dir + 'mb_merged/'  # peak-aligned slices are written here\n",
    "\n",
    "# create the output folders; exist_ok makes the cell safe to re-run\n",
    "os.makedirs(merged_dir, exist_ok=True)\n",
    "os.makedirs(save_dir, exist_ok=True)\n",
    "\n",
    "# load raw data\n",
    "cas_list = []\n",
    "for sample in slice_name_list:\n",
    "    sample_data = ad.read_h5ad(data_dir + sample + '.h5ad')\n",
    "\n",
    "    # drop the 'insertion' entry from obsm when the file carries one\n",
    "    if 'insertion' in sample_data.obsm:\n",
    "        del sample_data.obsm['insertion']\n",
    "\n",
    "    cas_list.append(sample_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cba40d92-0127-4ef6-9263-4862ba34310a",
   "metadata": {},
   "source": [
    "### Merge the peaks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fa2772a0-bcb0-4ca2-9914-94c6861bf887",
   "metadata": {},
   "outputs": [],
   "source": [
    "cas_list = INSTINCT.peak_sets_alignment(cas_list)\n",
    "\n",
    "# save the merged data\n",
    "for sample, adata in zip(slice_name_list, cas_list):\n",
    "    adata.write_h5ad(merged_dir + f'merged_{sample}.h5ad')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a073871b-ae69-46b2-a493-eee4ec29442d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load the merged data and suffix every barcode with its slice name,\n",
    "# which keeps obs names distinct across slices after concatenation\n",
    "cas_list = [ad.read_h5ad(merged_dir + f'merged_{sample}.h5ad') for sample in slice_name_list]\n",
    "for sample, cas in zip(slice_name_list, cas_list):\n",
    "    cas.obs_names = [x + '_' + sample for x in cas.obs_names]\n",
    "\n",
    "# read the raw RNA data; the RNA barcodes are renamed with the ATAC slice name\n",
    "# NOTE(review): the extra '-1' presumably mirrors a suffix already present on the ATAC barcodes - confirm\n",
    "rna_list = [ad.read_h5ad(data_dir + f'{sample}.h5ad') for sample in rna_slice_name_list]\n",
    "for sample, rna in zip(slice_name_list, rna_list):\n",
    "    rna.obs_names = [x + '-1_' + sample for x in rna.obs_names]\n",
    "    print(rna.shape)\n",
    "\n",
    "# filter spots that are not tissue: keep only the barcodes that also occur in the RNA data\n",
    "for i, (sample, rna) in enumerate(zip(slice_name_list, rna_list)):\n",
    "    on_tissue = cas_list[i].obs_names.isin(rna.obs_names)\n",
    "    if not on_tissue.any():\n",
    "        # an empty slice means the barcode naming of the two modalities does not line up\n",
    "        raise ValueError(f'No barcodes shared between the ATAC and RNA data of slice {sample}')\n",
    "    # copy() materializes the subset instead of keeping a view of the unfiltered object\n",
    "    cas_list[i] = cas_list[i][on_tissue, :].copy()\n",
    "    print(cas_list[i].shape)\n",
    "\n",
    "# concatenation\n",
    "adata_concat = ad.concat(cas_list, label=\"slice_name\", keys=slice_name_list)\n",
    "print(adata_concat.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "215f2ba4-c524-4f8b-9c63-5c52bb22174f",
   "metadata": {},
   "source": [
    "### Data preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cd7b1605-ce32-49a8-b5fc-9822d049666f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# preprocess CAS data\n",
    "print('Start preprocessing')\n",
    "INSTINCT.preprocess_CAS(cas_list, adata_concat, use_fragment_count=True, min_cells_rate=0.02)\n",
    "print(adata_concat.shape)\n",
    "print('Done!')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c59b125d-0611-4989-a814-34a9b3795737",
   "metadata": {},
   "outputs": [],
   "source": [
    "# checkpoint the preprocessed objects to disk\n",
    "adata_concat.write_h5ad(save_dir + 'preprocessed_concat.h5ad')\n",
    "for sample, cas in zip(slice_name_list, cas_list):\n",
    "    cas.write_h5ad(save_dir + f'filtered_merged_{sample}.h5ad')\n",
    "\n",
    "# read the checkpoint back, so the following cells work on exactly what was saved\n",
    "cas_list = [ad.read_h5ad(save_dir + f'filtered_merged_{sample}.h5ad') for sample in slice_name_list]\n",
    "adata_concat = ad.read_h5ad(save_dir + 'preprocessed_concat.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c818625f-f1ab-436e-8718-338f5db448dd",
   "metadata": {},
   "source": [
    "### Perform PCA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0767875b-7989-4d8b-b8f5-fc3671d7049d",
   "metadata": {},
   "outputs": [],
   "source": [
    "print('Applying PCA to reduce the feature dimension to 100 ...')\n",
    "pca = PCA(n_components=100, random_state=1234)\n",
    "# PCA is fitted on a dense array, hence the toarray() call\n",
    "input_matrix = pca.fit_transform(adata_concat.X.toarray())\n",
    "np.save(save_dir + 'input_matrix.npy', input_matrix)\n",
    "print('Done !')\n",
    "\n",
    "# reload the saved matrix and store it under the 'X_pca' key\n",
    "input_matrix = np.load(save_dir + 'input_matrix.npy')\n",
    "adata_concat.obsm['X_pca'] = input_matrix"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "434de957-dc51-4f27-9d31-c06c654c5b60",
   "metadata": {},
   "source": [
    "### Create neighbor graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e52a5727-027b-412b-af28-474e7e305362",
   "metadata": {},
   "outputs": [],
   "source": [
    "# calculate the spatial graph\n",
    "INSTINCT.create_neighbor_graph(cas_list, adata_concat)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a3455b98-610f-4073-910c-7b04aaa98c1a",
   "metadata": {},
   "source": [
    "### Data integration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d7407a51-368f-43ff-9caa-eb903011122f",
   "metadata": {},
   "outputs": [],
   "source": [
    "INSTINCT_model = INSTINCT.INSTINCT_Model(cas_list, adata_concat, device=device)\n",
    "\n",
    "INSTINCT_model.train(report_loss=True, report_interval=100)\n",
    "\n",
    "INSTINCT_model.eval(cas_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0d8f7251-a8b6-4dd1-bab5-887ae1e2dfac",
   "metadata": {},
   "outputs": [],
   "source": [
    "result = ad.concat(cas_list, label=\"slice_idx\", keys=slice_index_list)\n",
    "\n",
    "# write each embedding stored in obsm to its own header-less CSV file, one row per spot\n",
    "embedding_files = [\n",
    "    ('INSTINCT_latent', 'INSTINCT_embed.csv'),\n",
    "    ('INSTINCT_latent_noise', 'INSTINCT_noise_embed.csv'),\n",
    "]\n",
    "for obsm_key, file_name in embedding_files:\n",
    "    with open(save_dir + file_name, 'w', newline='') as file:\n",
    "        writer = csv.writer(file)\n",
    "        writer.writerows(result.obsm[obsm_key])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}