{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "71d217ff-59c6-41a8-b3cf-6334b2411f9d",
   "metadata": {},
   "source": [
    "# Integrating two vertically adjacent slices from the same developmental stage\n",
    "In this case, we integrate the two vertically adjacent slices from the same developmental stage and prepare for cross-sample annotation and downstream analysis."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "181b4345-0499-40e9-a56d-ca685287fb32",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import csv\n",
    "import torch\n",
    "import numpy as np\n",
    "import anndata as ad\n",
    "\n",
    "from sklearn.decomposition import PCA\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
    "\n",
    "import INSTINCT"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e1b83e3c-56e0-4f7d-b08a-f5bf531f28ed",
   "metadata": {},
   "source": [
    "### Load the raw data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d4fbbbfb-a257-4263-9bab-0ab7900b7b0d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# set the mode_index to the index of a specific stage in the mode_list\n",
    "# mode_index = 3 means integrating the two slices from E18.5\n",
    "mode_index = 3\n",
    "mode_list = ['E11_0', 'E13_5', 'E15_5', 'E18_5']\n",
    "mode = mode_list[mode_index]\n",
    "\n",
    "# mouse brain\n",
    "data_dir = '../../data/spMOdata/EpiTran_MouseBrain_Jiang2023/preprocessed/'\n",
    "if not os.path.exists(data_dir + f'{mode}/'):\n",
    "    os.makedirs(data_dir + f'{mode}/')\n",
    "save_dir = f'../../results/MouseBrain_Jiang2023/vertical/{mode}/'\n",
    "if not os.path.exists(save_dir + 'INSTINCT/'):\n",
    "    os.makedirs(save_dir + 'INSTINCT/')\n",
    "slice_name_list = [f'{mode}-S1', f'{mode}-S2']\n",
    "\n",
    "# load raw data\n",
    "cas_dict = {}\n",
    "for sample in slice_name_list:\n",
    "    sample_data = ad.read_h5ad(data_dir + sample + '_atac.h5ad')\n",
    "\n",
    "    if 'insertion' in sample_data.obsm:\n",
    "        del sample_data.obsm['insertion']\n",
    "\n",
    "    cas_dict[sample] = sample_data\n",
    "cas_list = [cas_dict[sample] for sample in slice_name_list]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3e730663-3319-439f-8405-f471538cb08b",
   "metadata": {},
   "source": [
    "### Merge the peaks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "609a506d-23a3-452c-ba36-53171f318fa4",
   "metadata": {},
   "outputs": [],
   "source": [
    "cas_list = INSTINCT.peak_sets_alignment(cas_list)\n",
    "\n",
    "# save the merged data\n",
    "for idx, adata in enumerate(cas_list):\n",
    "    adata.write_h5ad(f'{data_dir}{mode}/merged_{slice_name_list[idx]}_atac.h5ad')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "547a0d5a-597b-4c9f-a66a-ed0c899ab89c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load the merged data\n",
    "cas_list = [ad.read_h5ad(data_dir + mode + '/merged_' + sample + '_atac.h5ad') for sample in slice_name_list]\n",
    "for j in range(len(cas_list)):\n",
    "    cas_list[j].obs_names = [x + '_' + slice_name_list[j] for x in cas_list[j].obs_names]\n",
    "\n",
    "# concatenation\n",
    "adata_concat = ad.concat(cas_list, label=\"slice_name\", keys=slice_name_list)\n",
    "# adata_concat.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c478c324-f71d-4ab0-b259-4def6d7094a5",
   "metadata": {},
   "source": [
    "### Data preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20acf3c9-197f-4afd-8b92-ff7c81dfbfbd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# preprocess CAS data\n",
    "print('Start preprocessing')\n",
    "INSTINCT.preprocess_CAS(cas_list, adata_concat, use_fragment_count=True, min_cells_rate=0.03)\n",
    "print(adata_concat.shape)\n",
    "print('Done!')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "92aaf58a-cb9a-47bb-8af9-2ce16412cd05",
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_concat.write_h5ad(save_dir + f\"{mode}_preprocessed_concat_atac.h5ad\")\n",
    "for i in range(len(slice_name_list)):\n",
    "    cas_list[i].write_h5ad(save_dir + f\"filtered_merged_{slice_name_list[i]}_atac.h5ad\")\n",
    "\n",
    "cas_list = [ad.read_h5ad(save_dir + f\"filtered_merged_{sample}_atac.h5ad\") for sample in slice_name_list]\n",
    "# origin_concat = ad.concat(cas_list, label=\"slice_name\", keys=slice_name_list)\n",
    "adata_concat = ad.read_h5ad(save_dir + f\"{mode}_preprocessed_concat_atac.h5ad\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4c8602eb-c71e-442f-8068-c77e860c5b14",
   "metadata": {},
   "source": [
    "### Perform PCA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "08965fbe-c0bc-446c-bd74-aebc17623539",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f'Applying PCA to reduce the feature dimension to 100 ...')\n",
    "pca = PCA(n_components=100, random_state=1234)\n",
    "input_matrix = pca.fit_transform(adata_concat.X.toarray())\n",
    "np.save(save_dir + f'{mode}_input_matrix_atac.npy', input_matrix)\n",
    "print('Done !')\n",
    "\n",
    "input_matrix = np.load(save_dir + f'{mode}_input_matrix_atac.npy')\n",
    "adata_concat.obsm['X_pca'] = input_matrix"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5dd41a21-af44-421b-baa7-bdead21e8c59",
   "metadata": {},
   "source": [
    "### Create neighbor graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a1e8aa0-3b56-4545-8c1a-9698087b74e8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# calculate the spatial graph\n",
    "INSTINCT.create_neighbor_graph(cas_list, adata_concat)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8efdbf8b-0d9b-4106-9716-240e09f66ab7",
   "metadata": {},
   "source": [
    "### Run model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8493c9f9-34ba-461a-9dab-75006d8d2342",
   "metadata": {},
   "outputs": [],
   "source": [
    "INSTINCT_model = INSTINCT.INSTINCT_Model(cas_list, adata_concat, device=device)\n",
    "\n",
    "INSTINCT_model.train(report_loss=True, report_interval=100)\n",
    "\n",
    "INSTINCT_model.eval(cas_list)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ddb93ffd-612f-4bea-8c29-841c80ff0992",
   "metadata": {},
   "source": [
    "### Save the latent embeddings\n",
    "Save the latent representations of spots for further analyses"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0ffa2926-ee3c-4e32-87e8-8354dd32889b",
   "metadata": {},
   "outputs": [],
   "source": [
    "result = ad.concat(cas_list, label=\"slice_name\", keys=slice_name_list)\n",
    "\n",
    "with open(save_dir + f'INSTINCT/{mode}_INSTINCT_embed.csv', 'w', newline='') as file:\n",
    "    writer = csv.writer(file)\n",
    "    writer.writerows(result.obsm['INSTINCT_latent'])\n",
    "\n",
    "with open(save_dir + f'INSTINCT/{mode}_INSTINCT_noise_embed.csv', 'w', newline='') as file:\n",
    "    writer = csv.writer(file)\n",
    "    writer.writerows(result.obsm['INSTINCT_latent_noise'])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}