{ "cells": [ { "cell_type": "markdown", "id": "cf6a67ed-93b8-4149-862d-6134e98007bf", "metadata": {}, "source": [ "# Find DEGs" ] }, { "cell_type": "code", "execution_count": null, "id": "3713e817-c108-46bd-a94e-3bf1c6bef4f3", "metadata": {}, "outputs": [], "source": [ "import os\n", "import anndata as ad\n", "import scanpy as sc\n", "import pandas as pd\n", "\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")\n", "\n", "import matplotlib as mpl\n", "mpl.rcParams['pdf.fonttype'] = 42\n", "mpl.rcParams['ps.fonttype'] = 42\n", "\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")" ] }, { "cell_type": "code", "execution_count": null, "id": "e61c54a1-8a9d-47cc-8ded-fc5a41c4eb50", "metadata": {}, "outputs": [], "source": [ "save = True\n", "slice_name_list = [\"GSM6204623_MouseBrain_20um\", \"GSM6758284_MouseBrain_20um_repATAC\", \"GSM6758285_MouseBrain_20um_100barcodes_ATAC\"]\n", "rna_slice_name_list = [\"GSM6204636_MouseBrain_20um\", \"GSM6753041_MouseBrain_20um_repATAC\", \"GSM6753043_MouseBrain_20um_100barcodes_ATAC\"]\n", "label_list = [\"GSM6204623\", \"GSM6758284\", \"GSM6758285\"]\n", "slice_index_list = list(range(len(slice_name_list)))\n", "\n", "data_dir = '../../data/spMOdata/EpiTran_HumanMouse_Zhang2023/preprocessed_from_fragments/'\n", "save_dir = f'../../results/HumanMouse_Zhang2023/mb/'\n", "\n", "method = 'leiden'\n", "\n", "if not os.path.exists(save_dir + f'DEGs_concat/'):\n", " os.makedirs(save_dir + f'DEGs_concat/')\n", "for i in range(len(slice_name_list)):\n", " if not os.path.exists(save_dir + f'DEGs_slice{i}/'):\n", " os.makedirs(save_dir + f'DEGs_slice{i}/')" ] }, { "cell_type": "markdown", "id": "d20dd32c-27d6-4dfa-adc8-1829faad7bf9", "metadata": {}, "source": [ "### Load the spATAC-seq data and the corresponding SRT data" ] }, { "cell_type": "code", "execution_count": null, "id": "dee43d74-5a13-4a27-83fd-94acff8045d3", "metadata": {}, "outputs": [], "source": [ "# read the filtered and clustered CAS data\n", "cas_list = [ad.read_h5ad(save_dir + f'clustered_{sample}.h5ad') for sample in slice_name_list]\n", "\n", "# read the raw RNA data\n", "rna_list = [ad.read_h5ad(data_dir + f'{sample}.h5ad') for sample in rna_slice_name_list]\n", "for j in range(len(rna_list)):\n", " rna_list[j].obs_names = [x + '-1_' + slice_name_list[j] for x in rna_list[j].obs_names]\n", " print(rna_list[j].shape)\n", "\n", "# filter and reorder spots in rna slices\n", "for i in range(len(slice_name_list)):\n", " obs_list = [obs_name for obs_name in cas_list[i].obs_names if obs_name in rna_list[i].obs_names]\n", " cas_list[i] = cas_list[i][obs_list, :]\n", " rna_list[i] = rna_list[i][obs_list, :]\n", " # transfer the cluster labels from cas slices to rna slice\n", " rna_list[i].obs[method] = cas_list[i].obs[method].copy()\n", " print(rna_list[i].shape)" ] }, { "cell_type": "markdown", "id": "6ce13089-aec7-4dcf-bc97-fe2afb6daf42", "metadata": {}, "source": [ "### Normalization" ] }, { "cell_type": "code", "execution_count": null, "id": "60ec56cd-0f6a-497b-9bc8-982212a17e87", "metadata": {}, "outputs": [], "source": [ "# normalization\n", "for i in range(len(slice_name_list)):\n", " sc.pp.normalize_total(rna_list[i], target_sum=1e4)\n", " sc.pp.log1p(rna_list[i])\n", " rna_list[i].var_names_make_unique()\n", "\n", "# concatenate the rna slices\n", "rna_concat = ad.concat(rna_list, label='slice_name', keys=slice_name_list)\n", "print(rna_concat.shape)" ] }, { "cell_type": "markdown", "id": "32d83b27-7a4d-4c71-8fde-2f1e2efe5c8c", "metadata": {}, "source": [ "### Find DEGs" ] }, { "cell_type": "code", "execution_count": null, "id": "54e8901c-ae22-4fdf-ac86-96f72916f153", "metadata": {}, "outputs": [], "source": [ "# find DEGs\n", "group_list = list(set(rna_concat.obs[method]))\n", "print(group_list)\n", "\n", "rna_concat_degs_list = []\n", "sc.tl.rank_genes_groups(rna_concat, method, groups=group_list, method='wilcoxon')\n", "rna_concat_genes = pd.DataFrame(rna_concat.uns[\"rank_genes_groups\"][\"names\"])\n", "rna_concat_logfoldchanges = pd.DataFrame(rna_concat.uns[\"rank_genes_groups\"][\"logfoldchanges\"])\n", "rna_concat_pvals_adj = pd.DataFrame(rna_concat.uns[\"rank_genes_groups\"][\"pvals_adj\"])\n", "for col in list(rna_concat_genes.columns):\n", " concat_genes = rna_concat_genes[col].tolist()\n", " concat_logfoldchanges = rna_concat_logfoldchanges[col].tolist()\n", " concat_pvals_adj = rna_concat_pvals_adj[col].tolist()\n", " concat_degs_list = [concat_genes[i] for i in range(len(concat_genes)) if concat_logfoldchanges[i] > 1 and concat_pvals_adj[i] < 0.01]\n", " rna_concat_degs_list.append(concat_degs_list)\n", " # save DEGs\n", " if save:\n", " if not concat_degs_list:\n", " with open(save_dir + f'DEGs_concat/{col}_DEGs.txt', 'w') as f:\n", " pass\n", " else:\n", " with open(save_dir + f'DEGs_concat/{col}_DEGs.txt', 'w') as f:\n", " # f.write(','.join(concat_degs_list))\n", " for item in concat_degs_list:\n", " f.write(item + '\\n')\n", " print(f\"Label: {col}, Number of DEGs: {len(concat_degs_list)}\")\n", "\n", "\n", "# find DEGs slice specific\n", "for i in range(len(slice_name_list)):\n", "\n", " group_list = list(set(rna_list[i].obs[method]))\n", " print(group_list)\n", "\n", " rna_degs_list = []\n", " sc.tl.rank_genes_groups(rna_list[i], method, groups=group_list, method='wilcoxon')\n", " rna_genes = pd.DataFrame(rna_list[i].uns[\"rank_genes_groups\"][\"names\"])\n", " rna_logfoldchanges = pd.DataFrame(rna_list[i].uns[\"rank_genes_groups\"][\"logfoldchanges\"])\n", " rna_pvals_adj = pd.DataFrame(rna_list[i].uns[\"rank_genes_groups\"][\"pvals_adj\"])\n", " for col in list(rna_genes.columns):\n", " genes = rna_genes[col].tolist()\n", " logfoldchanges = rna_logfoldchanges[col].tolist()\n", " pvals_adj = rna_pvals_adj[col].tolist()\n", " degs_list = [genes[j] for j in range(len(genes)) if logfoldchanges[j] > 1 and pvals_adj[j] < 0.05]\n", " rna_degs_list.append(degs_list)\n", " # save DEGs\n", " if save:\n", " if not degs_list:\n", " with open(save_dir + f'DEGs_slice{i}/{col}_DEGs.txt', 'w') as f:\n", " pass\n", " else:\n", " with open(save_dir + f'DEGs_slice{i}/{col}_DEGs.txt', 'w') as f:\n", " # f.write(','.join(degs_list))\n", " for item in degs_list:\n", " f.write(item + '\\n')\n", " print(f\"Label: {col}, Number of DEGs: {len(degs_list)}\")" ] }, { "cell_type": "markdown", "id": "489f7504-acbc-4c3a-b6d6-cb4ef500675e", "metadata": {}, "source": [ "### Plot DEGs" ] }, { "cell_type": "code", "execution_count": null, "id": "c51b5689-17d3-43b3-94b2-d76bd0482c6f", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "\n", "save = True\n", "slice_name_list = [\"GSM6204623_MouseBrain_20um\", \"GSM6758284_MouseBrain_20um_repATAC\", \"GSM6758285_MouseBrain_20um_100barcodes_ATAC\"]\n", "rna_slice_name_list = [\"GSM6204636_MouseBrain_20um\", \"GSM6753041_MouseBrain_20um_repATAC\", \"GSM6753043_MouseBrain_20um_100barcodes_ATAC\"]\n", "label_list = [\"GSM6204623\", \"GSM6758284\", \"GSM6758285\"]\n", "slice_index_list = list(range(len(slice_name_list)))\n", "\n", "data_dir = '../../data/spMOdata/EpiTran_HumanMouse_Zhang2023/preprocessed_from_fragments/'\n", "save_dir = f'../../results/HumanMouse_Zhang2023/mb/'\n", "\n", "marker_list = ['Rgs9', 'Pde10a', 'Gng7', 'Bcl11b', 'Foxp1',\n", " 'Plp1', 'Mbp', 'Tspan2',\n", " 'Dgkg',\n", " 'Sox4', 'Dlx1', # 'Zbtb20',\n", " 'Isl1', 'Rreb1',\n", " 'Mef2c',\n", " ]\n", "\n", "marker = 'Plp1'\n", "if not os.path.exists(save_dir + f'{marker}/') and save:\n", " os.makedirs(save_dir + f'{marker}/')" ] }, { "cell_type": "code", "execution_count": null, "id": "b5f53e9b-4c56-46af-aadd-f5a376a7e88a", "metadata": {}, "outputs": [], "source": [ "# read the filtered and clustered CAS data\n", "cas_list = [ad.read_h5ad(save_dir + f'clustered_{sample}.h5ad') for sample in slice_name_list]\n", "cas_concat = ad.concat(cas_list, label='slice_name', keys=slice_name_list)\n", "cas_concat.obsm['X_pca'] = np.load(save_dir + f'input_matrix.npy')\n", "cas_concat.obsm['INSTINCT_latent'] = pd.read_csv(save_dir + f'INSTINCT_embed.csv', header=None).values\n", "cas_concat.obsm['raw_emb'] = pd.read_csv(save_dir + f'sp_embeddings_raw.csv', header=None).values\n", "cas_concat.obsm['inte_emb'] = pd.read_csv(save_dir + f'sp_embeddings_integrated.csv', header=None).values\n", "\n", "# read the raw RNA data\n", "rna_list = [ad.read_h5ad(data_dir + f'{sample}.h5ad') for sample in rna_slice_name_list]\n", "for j in range(len(rna_list)):\n", " rna_list[j].obs_names = [x + '-1_' + slice_name_list[j] for x in rna_list[j].obs_names]\n", " print(rna_list[j].shape)\n", "\n", "# filter and reorder spots in rna slices\n", "for i in range(len(slice_name_list)):\n", " obs_list = [obs_name for obs_name in cas_list[i].obs_names if obs_name in rna_list[i].obs_names]\n", " cas_list[i] = cas_list[i][obs_list, :]\n", " rna_list[i] = rna_list[i][obs_list, :]\n", " print(rna_list[i].shape)\n", "\n", "# concatenate the rna slices\n", "rna_concat = ad.concat(rna_list, label='slice_name', keys=slice_name_list)\n", "sc.pp.filter_genes(rna_concat, min_cells=500)\n", "sc.pp.normalize_total(rna_concat, target_sum=1e4)\n", "sc.pp.log1p(rna_concat)\n", "print(rna_concat.shape)\n", "cas_concat = cas_concat[rna_concat.obs_names, :]\n", "rna_concat.obsm['X_pca'] = cas_concat.obsm['X_pca']\n", "rna_concat.obsm['INSTINCT_latent'] = cas_concat.obsm['INSTINCT_latent']\n", "rna_concat.obsm['raw_emb'] = cas_concat.obsm['raw_emb']\n", "rna_concat.obsm['inte_emb'] = cas_concat.obsm['inte_emb']\n", "\n", "data_column = rna_concat[:, marker].copy()\n", "\n", "# raw\n", "sp_embedding = rna_concat.obsm['raw_emb'].copy()\n", "n_spots = rna_concat.shape[0]\n", "size = 10000 / n_spots\n", "order = np.arange(n_spots)\n", "plt.figure(figsize=(6, 5))\n", "plt.scatter(sp_embedding[order, 0], sp_embedding[order, 1], s=size, c=data_column.X.toarray(), cmap='viridis')\n", "plt.tick_params(axis='both', bottom=False, top=False, left=False, right=False,\n", " labelleft=False, labelbottom=False, grid_alpha=0)\n", "plt.colorbar(label='Color')\n", "\n", "plt.title(f'Raw', fontsize=16)\n", "if save:\n", " save_path = save_dir + f\"{marker}/{marker}_raw_umap.pdf\"\n", " plt.savefig(save_path)\n", "\n", "# integrated\n", "sp_embedding = rna_concat.obsm['inte_emb'].copy()\n", "n_spots = rna_concat.shape[0]\n", "size = 10000 / n_spots\n", "order = np.arange(n_spots)\n", "plt.figure(figsize=(6, 5))\n", "plt.scatter(sp_embedding[order, 0], sp_embedding[order, 1], s=size, c=data_column.X.toarray(), cmap='viridis')\n", "plt.tick_params(axis='both', bottom=False, top=False, left=False, right=False,\n", " labelleft=False, labelbottom=False, grid_alpha=0)\n", "plt.colorbar(label='Color')\n", "\n", "plt.title(f'Integrated', fontsize=16)\n", "if save:\n", " save_path = save_dir + f\"{marker}/{marker}_integrated_umap.pdf\"\n", " plt.savefig(save_path)\n", "\n", "scalar_mappable = plt.cm.ScalarMappable(cmap='viridis')\n", "colors = scalar_mappable.to_rgba(data_column.X.toarray())\n", "\n", "spots_count = [0]\n", "n = 0\n", "for sample in rna_list:\n", " num = sample.shape[0]\n", " n += num\n", " spots_count.append(n)\n", "\n", "# fig, axs = plt.subplots(1, 2, figsize=(8, 4))\n", "fig, axs = plt.subplots(1, 3, figsize=(12, 4))\n", "for i in range(len(rna_list)):\n", " cluster_colors = list(colors[spots_count[i]: spots_count[i+1]])\n", " if i == 2:\n", " s = 10\n", " else:\n", " s = 40\n", " axs[i].scatter(rna_list[i].obsm['spatial'][:, 1], rna_list[i].obsm['spatial'][:, 0], linewidth=1, s=s,\n", " marker=\".\", color=cluster_colors, alpha=0.9)\n", " if i != 1:\n", " axs[i].invert_yaxis()\n", " if i != 0:\n", " axs[i].invert_xaxis()\n", " axs[i].set_title(f'{label_list[i]}', size=12)\n", " axs[i].axis('off')\n", "# plt.colorbar(label='Color')\n", "plt.gcf().subplots_adjust(left=0.05, top=0.8, bottom=0.05, right=0.90)\n", "if save:\n", " save_path = save_dir + f'{marker}/{marker}_slices.pdf'\n", " plt.savefig(save_path)\n", "plt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" } }, "nbformat": 4, "nbformat_minor": 5 }