{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "41e8ad90-f57b-4f8c-9048-d129395e15c5",
   "metadata": {},
   "source": [
    "# Integrating the three slices from the spatial-ATAC-seq ME dataset\n",
    "This dataset contains one slice from the E11 developmental stage and two slices from the E13 stage, with the sequencing depth of the ME13_1 slice being significantly higher than that of the other two slices, resulting in over 590,000 peaks and significantly higher fragment counts. The other two slices have only about 290,000 peaks, but higher ratio of reads in transcription start sites (TSS) compared to the ME13_1 slice."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0a29fa54-1246-4595-b4e2-d81df2307c38",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import csv\n",
    "import torch\n",
    "import numpy as np\n",
    "import anndata as ad\n",
    "\n",
    "from sklearn.decomposition import PCA\n",
    "\n",
    "import INSTINCT\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f96a6d57-e6b5-45e5-8b92-a03819d6b1e4",
   "metadata": {},
   "source": [
    "### Load the raw data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fc72aac6-4adc-4fda-908c-22ee4fb2d10e",
   "metadata": {},
   "outputs": [],
   "source": [
    "save = True\n",
    "\n",
    "data_dir = '../../data/spCASdata/HumanMouse_Deng2022/preprocessed/'\n",
    "save_dir = '../../results/HumanMouse_Deng2022/'\n",
    "slice_name_list = ['GSM5238385_ME11_50um', 'GSM5238386_ME13_50um', 'GSM5238387_ME13_50um_2']\n",
    "slice_used = [0, 1, 2]\n",
    "slice_name_list = [slice_name_list[i] for i in slice_used]\n",
    "slice_index_list = list(range(len(slice_name_list)))\n",
    "\n",
    "save_dir = f'../../results/HumanMouse_Deng2022/{slice_used}/'\n",
    "if not os.path.exists(data_dir + f'{slice_used}/'):\n",
    "    os.makedirs(data_dir + f'{slice_used}/')\n",
    "if not os.path.exists(save_dir):\n",
    "    os.makedirs(save_dir)\n",
    "\n",
    "# load raw data\n",
    "cas_list = []\n",
    "for sample in slice_name_list:\n",
    "    sample_data = ad.read_h5ad(data_dir + sample + '.h5ad')\n",
    "\n",
    "    if 'insertion' in sample_data.obsm:\n",
    "        del sample_data.obsm['insertion']\n",
    "\n",
    "    cas_list.append(sample_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8003f313-cad7-451f-bf26-23bb69e4bc0b",
   "metadata": {},
   "source": [
    "### Merge the peaks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a43592f8-8ea3-4c3b-b147-2e6dbe32675a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# merge peaks\n",
    "cas_list = INSTINCT.peak_sets_alignment(cas_list)\n",
    "\n",
    "# save the merged data\n",
    "for idx, adata in enumerate(cas_list):\n",
    "    adata.write_h5ad(data_dir + f'{slice_used}/merged_{slice_name_list[idx]}.h5ad')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "da8bd913-47bd-4300-9429-7ed51bca623b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load the merged data\n",
    "cas_list = [ad.read_h5ad(data_dir + f'{slice_used}/merged_{sample}.h5ad') for sample in slice_name_list]\n",
    "for j in range(len(cas_list)):\n",
    "    cas_list[j].obs_names = [x + '_' + slice_name_list[j] for x in cas_list[j].obs_names]\n",
    "    if 'in_tissue' in cas_list[j].obs.keys():\n",
    "        cas_list[j] = cas_list[j][cas_list[j].obs['in_tissue'] == 1, :]\n",
    "\n",
    "# concatenation\n",
    "adata_concat = ad.concat(cas_list, label=\"slice_name\", keys=slice_name_list)\n",
    "# adata_concat.obs_names_make_unique()\n",
    "print(adata_concat.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6ce0dd43-a9d1-4c33-ad88-e59ec0376069",
   "metadata": {},
   "source": [
    "### Data preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca16d1f2-0b60-49a4-ba76-89714871bee4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# preprocess CAS data\n",
    "print('Start preprocessing')\n",
    "INSTINCT.preprocess_CAS(cas_list, adata_concat, use_fragment_count=True, min_cells_rate=0.05)\n",
    "print(adata_concat.shape)\n",
    "print('Done!')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c0d5055d-c295-4ea8-99d0-d8d47c058dae",
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_concat.write_h5ad(save_dir + f\"preprocessed_concat.h5ad\")\n",
    "for i in range(len(slice_name_list)):\n",
    "    cas_list[i].write_h5ad(save_dir + f\"filtered_merged_{slice_name_list[i]}.h5ad\")\n",
    "\n",
    "cas_list = [ad.read_h5ad(save_dir + f\"filtered_merged_{sample}.h5ad\") for sample in slice_name_list]\n",
    "adata_concat = ad.read_h5ad(save_dir + f\"preprocessed_concat.h5ad\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e578138b-b8b7-4ed7-8917-2f4fc60a1865",
   "metadata": {},
   "source": [
    "### Perform PCA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b3d04ec-d6b0-4783-b93e-b63a588376d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f'Applying PCA to reduce the feature dimension to 100 ...')\n",
    "pca = PCA(n_components=100, random_state=1234)\n",
    "input_matrix = pca.fit_transform(adata_concat.X.toarray())\n",
    "np.save(save_dir + f'input_matrix.npy', input_matrix)\n",
    "print('Done !')\n",
    "\n",
    "input_matrix = np.load(save_dir + f'input_matrix.npy')\n",
    "adata_concat.obsm['X_pca'] = input_matrix"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "42f3889e-7883-4450-b16b-6d4dfad38c19",
   "metadata": {},
   "source": [
    "### Create neighbor graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d6c14c80-fbba-4343-982d-554f5851565c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# calculate the spatial graph\n",
    "INSTINCT.create_neighbor_graph(cas_list, adata_concat)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "108b2dd7-684b-4716-bdd1-e6b2a1967633",
   "metadata": {},
   "source": [
    "### Run model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "335899cf-3ee6-4971-8514-69bf447d2b7f",
   "metadata": {},
   "outputs": [],
   "source": [
    "INSTINCT_model = INSTINCT.INSTINCT_Model(cas_list, adata_concat, device=device)\n",
    "\n",
    "INSTINCT_model.train(report_loss=True, report_interval=100)\n",
    "\n",
    "INSTINCT_model.eval(cas_list)\n",
    "\n",
    "result = ad.concat(cas_list, label=\"slice_idx\", keys=slice_index_list)\n",
    "\n",
    "if save:\n",
    "    with open(save_dir + f'INSTINCT_embed.csv', 'w', newline='') as file:\n",
    "        writer = csv.writer(file)\n",
    "        writer.writerows(result.obsm['INSTINCT_latent'])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}