{ "cells": [ { "cell_type": "markdown", "id": "4a7e2cdf-fbb7-4b0e-9c7b-d3d4ed79b864", "metadata": {}, "source": [ "# Gene ontology (GO) analysis\n", "Performing GO analysis based on the differentially expressed genes (DEGs) determined based on the annotation results of S2 slice." ] }, { "cell_type": "code", "execution_count": null, "id": "c4345e61-ea54-450f-b43d-76fe1b5a1a85", "metadata": {}, "outputs": [], "source": [ "import os\n", "import anndata as ad\n", "import scanpy as sc\n", "import pandas as pd\n", "import gseapy as gp\n", "\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")" ] }, { "cell_type": "markdown", "id": "d3915236-f31a-4b47-a996-6dd4f57293c7", "metadata": {}, "source": [ "### Load the data\n", "Load both the annotated spATAC-seq data and their corresponding SRT data, since MISAR-seq is a multi-omics data which sequenced both chromatin accessibility and RNA of each slice." ] }, { "cell_type": "code", "execution_count": null, "id": "d0574e07-f190-4b06-b548-a7705142811d", "metadata": {}, "outputs": [], "source": [ "model = 'INSTINCT'\n", "\n", "mode_list = ['E11_0', 'E13_5', 'E15_5', 'E18_5']\n", "mode_index = 3\n", "mode = mode_list[mode_index]\n", "\n", "data_dir = '../../data/spMOdata/EpiTran_MouseBrain_Jiang2023/preprocessed/'\n", "# sc.settings.set_figure_params(dpi=300, facecolor=\"white\")\n", "\n", "save_dir = f'../../results/MouseBrain_Jiang2023/vertical/{mode}/'\n", "if not os.path.exists(save_dir + f'{model}/DEGs/S1/'):\n", " os.makedirs(save_dir + f'{model}/DEGs/S1/')\n", "if not os.path.exists(save_dir + f'{model}/DEGs/S2/'):\n", " os.makedirs(save_dir + f'{model}/DEGs/S2/')\n", "slice_name_list = [f'{mode}-S1', f'{mode}-S2']\n", "\n", "# read the filtered and annotated CAS data\n", "cas_s1 = ad.read_h5ad(save_dir + f'filtered_merged_{slice_name_list[0]}_atac.h5ad')\n", "cas_s2 = ad.read_h5ad(save_dir + f'{model}/annotated_{slice_name_list[1]}_atac.h5ad')\n", "cas_list = [cas_s1, cas_s2]\n", "\n", "# read the raw RNA data\n", "rna_list = [ad.read_h5ad(data_dir + f'{sample}_expr.h5ad') for sample in slice_name_list]\n", "for j in range(len(rna_list)):\n", " rna_list[j].obs_names = [x + '_' + slice_name_list[j] for x in rna_list[j].obs_names]\n", "\n", "# filter and reorder spots in rna slices\n", "obs_list = [obs_name for obs_name in cas_list[0].obs_names if obs_name in rna_list[0].obs_names]\n", "cas_list[0] = cas_list[0][obs_list, :]\n", "rna_list[0] = rna_list[0][obs_list, :]\n", "# rename the obs in CAS s2\n", "cas_list[1].obs_names = [obs_name.split('_')[0] + f'-1_{slice_name_list[1]}' for obs_name in cas_list[1].obs_names]\n", "obs_list = [obs_name for obs_name in cas_list[1].obs_names if obs_name in rna_list[1].obs_names]\n", "cas_list[1] = cas_list[1][obs_list, :]\n", "rna_list[1] = rna_list[1][obs_list, :]\n", "\n", "# transfer the annotated labels from cas s2 slice to rna s2 slice\n", "rna_list[1].obs['predicted_labels'] = cas_list[1].obs['predicted_labels'].copy()\n", "\n", "# [cas_s1, cas_s2] = cas_list\n", "[rna_s1, rna_s2] = rna_list" ] }, { "cell_type": "markdown", "id": "b73783bb-4aee-4385-8ffb-d847d5c69de2", "metadata": {}, "source": [ "### Data preprocessing" ] }, { "cell_type": "code", "execution_count": null, "id": "4202bba8-8ee2-447f-9b29-0cece763ab57", "metadata": {}, "outputs": [], "source": [ "# normalization\n", "sc.pp.normalize_total(rna_s1, target_sum=1e4)\n", "sc.pp.log1p(rna_s1)\n", "rna_s1.var_names_make_unique()\n", "sc.pp.normalize_total(rna_s2, target_sum=1e4)\n", "sc.pp.log1p(rna_s2)\n", "rna_s2.var_names_make_unique()" ] }, { "cell_type": "markdown", "id": "1282e806-f6a4-4aa8-80cb-c31059c3d3ff", "metadata": {}, "source": [ "### Find DEGs" ] }, { "cell_type": "code", "execution_count": null, "id": "876d60ab-5872-4942-a701-37903c129efe", "metadata": {}, "outputs": [], "source": [ "# find DEGs\n", "group_list = list(set(rna_s2.obs['predicted_labels']))\n", "print(group_list)\n", "# predicted_labels = cas_s2.obs['predicted_labels']\n", "# label_counts = predicted_labels.value_counts()\n", "# labels_with_30plus_points = label_counts[label_counts > 30].index.tolist()\n", "# print(\"Labels with more than 30 spots:\")\n", "# print(labels_with_30plus_points)\n", "\n", "print('S2')\n", "rna_s2_degs_list = []\n", "sc.tl.rank_genes_groups(rna_s2, \"predicted_labels\", groups=group_list, method='wilcoxon')\n", "rna_s2_genes = pd.DataFrame(rna_s2.uns[\"rank_genes_groups\"][\"names\"])\n", "rna_s2_logfoldchanges = pd.DataFrame(rna_s2.uns[\"rank_genes_groups\"][\"logfoldchanges\"])\n", "rna_s2_pvals_adj = pd.DataFrame(rna_s2.uns[\"rank_genes_groups\"][\"pvals_adj\"])\n", "for col in list(rna_s2_genes.columns):\n", " s2_genes = rna_s2_genes[col].tolist()\n", " s2_logfoldchanges = rna_s2_logfoldchanges[col].tolist()\n", " s2_pvals_adj = rna_s2_pvals_adj[col].tolist()\n", " s2_degs_list = [s2_genes[i] for i in range(len(s2_genes)) if s2_logfoldchanges[i] > 0.2 and s2_pvals_adj[i] < 0.05]\n", " rna_s2_degs_list.append(s2_degs_list)\n", " # save DEGs\n", " if not s2_degs_list:\n", " with open(save_dir + f'{model}/DEGs/S2/{col}_DEGs.txt', 'w') as f:\n", " pass\n", " else:\n", " with open(save_dir + f'{model}/DEGs/S2/{col}_DEGs.txt', 'w') as f:\n", " for item in s2_degs_list:\n", " f.write(item + '\\n')\n", " print(f\"Label: {col}, Number of DEGs: {len(s2_degs_list)}\")\n", "\n", "print('\\nS1')\n", "rna_s1_degs_list = []\n", "sc.tl.rank_genes_groups(rna_s1, \"Annotation_for_Combined\", groups=group_list, method='wilcoxon')\n", "rna_s1_genes = pd.DataFrame(rna_s1.uns[\"rank_genes_groups\"][\"names\"])\n", "rna_s1_logfoldchanges = pd.DataFrame(rna_s1.uns[\"rank_genes_groups\"][\"logfoldchanges\"])\n", "rna_s1_pvals_adj = pd.DataFrame(rna_s1.uns[\"rank_genes_groups\"][\"pvals_adj\"])\n", "for col in list(rna_s1_genes.columns):\n", " s1_genes = rna_s1_genes[col].tolist()\n", " s1_logfoldchanges = rna_s1_logfoldchanges[col].tolist()\n", " s1_pvals_adj = rna_s1_pvals_adj[col].tolist()\n", " s1_degs_list = [s1_genes[i] for i in range(len(s1_genes)) if s1_logfoldchanges[i] > 0.2 and s1_pvals_adj[i] < 0.05]\n", " rna_s1_degs_list.append(s1_degs_list)\n", " # save DEGs\n", " if not s1_degs_list:\n", " with open(save_dir + f'{model}/DEGs/S1/{col}_DEGs.txt', 'w') as f:\n", " pass\n", " else:\n", " with open(save_dir + f'{model}/DEGs/S1/{col}_DEGs.txt', 'w') as f:\n", " for item in s1_degs_list:\n", " f.write(item + '\\n')\n", " print(f\"Label: {col}, Number of DEGs: {len(s1_degs_list)}\")" ] }, { "cell_type": "markdown", "id": "5aba6092-b3db-44fe-ad59-c70627dcd8fd", "metadata": {}, "source": [ "### Perform GO analysis" ] }, { "cell_type": "code", "execution_count": null, "id": "1a59ce45-aa38-4fc9-9ab9-3145010ab632", "metadata": {}, "outputs": [], "source": [ "# GO analysis\n", "# names = gp.get_library_name(organism='mouse')\n", "for i, col in enumerate(list(rna_s2_genes.columns)):\n", " if not rna_s2_degs_list[i]:\n", " print(f'{col} Skipped')\n", " continue\n", " print(col)\n", " enr = gp.enrichr(gene_list=rna_s2_degs_list[i], gene_sets='GO_Biological_Process_2021', organism='mouse',\n", " outdir=save_dir + f'{model}/GO_analysis/{col}/')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" } }, "nbformat": 4, "nbformat_minor": 5 }