{ "cells": [ { "cell_type": "markdown", "id": "987c13ae-983d-487a-9ba0-9e28b379e293", "metadata": {}, "source": [ "# Find cluster-specific peaks across the three slices" ] }, { "cell_type": "code", "execution_count": null, "id": "b07c1bb6-5cbf-4b5d-951d-38f955741ff2", "metadata": {}, "outputs": [], "source": [ "import os\n", "import numpy as np\n", "import anndata as ad\n", "import scanpy as sc\n", "import pandas as pd\n", "import scipy\n", "\n", "import INSTINCT\n", "\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")" ] }, { "cell_type": "markdown", "id": "94d23e01-fe15-4f35-b011-f80753f3eac0", "metadata": {}, "source": [ "### Load clustered data" ] }, { "cell_type": "code", "execution_count": null, "id": "fa3c9132-c853-41f1-9901-8a8d54efd1b6", "metadata": {}, "outputs": [], "source": [ "save = True\n", "n_peaks = 1000\n", "model = 'INSTINCT'\n", "method = 'leiden'\n", "\n", "data_dir = '../../data/spCASdata/HumanMouse_Deng2022/preprocessed/'\n", "save_dir = '../../results/HumanMouse_Deng2022/'\n", "slice_name_list = ['GSM5238385_ME11_50um', 'GSM5238386_ME13_50um', 'GSM5238387_ME13_50um_2']\n", "label_list = ['GSM5238385', 'GSM5238386', 'GSM5238387']\n", "slice_used = [0, 1, 2]\n", "slice_name_list = [slice_name_list[i] for i in slice_used]\n", "label_list = [label_list[i] for i in slice_used]\n", "slice_index_list = list(range(len(slice_name_list)))\n", "\n", "save_dir = f'../../results/HumanMouse_Deng2022/{slice_used}/'\n", "\n", "if not os.path.exists(save_dir + f'concat/peaks/'):\n", " os.makedirs(save_dir + f'concat/peaks/')\n", "\n", "cas_list = [ad.read_h5ad(save_dir + f'clustered_{sample}.h5ad') for i, sample in enumerate(slice_name_list)]\n", "adata_concat = ad.concat(cas_list, label=\"slice_name\", keys=label_list)\n", "print(adata_concat.shape)" ] }, { "cell_type": "markdown", "id": "b046d857-c415-4c15-af5c-d68424b18544", "metadata": {}, "source": [ "### Data preprocessing" ] }, { "cell_type": "code", "execution_count": null, "id": "06c9a4d3-319b-4dbe-a2cd-faf74d152739", "metadata": {}, "outputs": [], "source": [ "adata_concat.X = scipy.sparse.csr_matrix(np.ceil((adata_concat.X / 2).toarray()))\n", "adata_concat.X = INSTINCT.TFIDF(adata_concat.X.T, type_=2).T.copy()" ] }, { "cell_type": "markdown", "id": "c8c116b4-d108-453f-b8ed-01ad6c58a87f", "metadata": {}, "source": [ "### Find cluster-specific peaks" ] }, { "cell_type": "code", "execution_count": null, "id": "c9edcf6a-eafd-4ead-89e1-f96e4125c09c", "metadata": {}, "outputs": [], "source": [ "# find cluster specific peaks\n", "group_list = list(set(adata_concat.obs[method]))\n", "print(group_list)\n", "\n", "sc.tl.rank_genes_groups(adata_concat, method, groups=group_list, method='wilcoxon')\n", "peaks_list = []\n", "seen_peaks = set()\n", "\n", "cas_peaks = pd.DataFrame(adata_concat.uns[\"rank_genes_groups\"][\"names\"])\n", "cas_logfoldchanges = pd.DataFrame(adata_concat.uns[\"rank_genes_groups\"][\"logfoldchanges\"])\n", "cas_pvals_adj = pd.DataFrame(adata_concat.uns[\"rank_genes_groups\"][\"pvals_adj\"])\n", "\n", "for col in list(cas_peaks.columns):\n", "\n", " peaks = cas_peaks[col].tolist()\n", " logfoldchanges = cas_logfoldchanges[col].tolist()\n", " pvals_adj = cas_pvals_adj[col].tolist()\n", "\n", " peaks_filtered = [peaks[j] for j in range(len(peaks)) if logfoldchanges[j] > 1]\n", " pvals_adj_filtered = [pvals_adj[j] for j in range(len(pvals_adj)) if logfoldchanges[j] > 1]\n", " print(len(peaks_filtered))\n", "\n", " if len(peaks_filtered) <= n_peaks:\n", " selected_peaks = peaks_filtered\n", " else:\n", " min_indices = np.argsort(pvals_adj_filtered)[:n_peaks]\n", " selected_peaks = [peaks_filtered[j] for j in min_indices]\n", " # save peaks\n", " if save:\n", " if not selected_peaks:\n", " with open(save_dir + f'concat/peaks/{col}_specific_peaks.txt', 'w') as f:\n", " pass\n", " else:\n", " with open(save_dir + f'concat/peaks/{col}_specific_peaks.txt', 'w') as f:\n", " for item in selected_peaks:\n", " f.write(item + '\\n')\n", " for peak in selected_peaks:\n", " if peak not in seen_peaks:\n", " peaks_list.append(peak)\n", " seen_peaks.add(peak)\n", " print(f\"Label: {col}, Number of specific peaks: {len(selected_peaks)}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "38e21999-9be1-4fc1-a900-e9625e592dfd", "metadata": {}, "outputs": [], "source": [ "# save the data with selected peaks\n", "sorted_peaks = [peak for peak in adata_concat.var_names if peak in peaks_list]\n", "print(len(sorted_peaks))\n", "if save:\n", " with open(save_dir + f'concat/peaks/all_specific_peaks.txt', 'w') as f:\n", " for item in sorted_peaks:\n", " f.write(item + '\\n')\n", " adata_concat = ad.concat(cas_list, label=\"slice_name\", keys=label_list)\n", " adata_concat = adata_concat[adata_concat.obs_names, sorted_peaks]\n", " adata_concat.X = scipy.sparse.csr_matrix(np.ceil((adata_concat.X / 2).toarray()))\n", " adata_concat.X = INSTINCT.TFIDF(adata_concat.X.T, type_=2).T.copy()\n", " print(adata_concat.shape)\n", " adata_concat.write_h5ad(save_dir + f'concat/selected_concat.h5ad')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" } }, "nbformat": 4, "nbformat_minor": 5 }