{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "31ddc1c1-e45b-454f-a564-7f1e6fe05207",
   "metadata": {},
   "source": [
    "# Integrating three slices from spatial ATAC-RNA-seq MB dataset\n",
    "Utilize INSTINCT to integrate three samples of mouse postnatal day 21/22 (P21/22) brains generated by spatial ATAC-RNA-seq (spatial ATAC-RNA-seq MB), a multi-omics sequencing technique.  \n",
    "It is worth noting that although these three slices were sequenced from similar developmental stages of the same organ, the first two slices (slice 0 and 1) have a size of  barcodes, while the third slice (slice 2) contains  barcodes.  \n",
    "This resulted in the sequencing of brain tissues of different scales, with slice 2 essentially encompassing the entire hemisphere of the coronal brain, while slice 0 and 1 only contained approximately one-quarter of the size. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e6c5691-770b-466c-b550-15be82217191",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import anndata as ad\n",
    "import numpy as np\n",
    "import torch\n",
    "import csv\n",
    "\n",
    "from sklearn.decomposition import PCA\n",
    "import INSTINCT\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8a918c30-3d27-4506-a50a-b584aa0b6f6e",
   "metadata": {},
   "source": [
    "### Load the raw data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d46df7a8-e2bd-4a3e-a9d9-354f2469f30e",
   "metadata": {},
   "outputs": [],
   "source": [
    "slice_name_list = [\"GSM6204623_MouseBrain_20um\", \"GSM6758284_MouseBrain_20um_repATAC\", \"GSM6758285_MouseBrain_20um_100barcodes_ATAC\"]\n",
    "rna_slice_name_list = [\"GSM6204636_MouseBrain_20um\", \"GSM6753041_MouseBrain_20um_repATAC\", \"GSM6753043_MouseBrain_20um_100barcodes_ATAC\"]\n",
    "slice_index_list = list(range(len(slice_name_list)))\n",
    "\n",
    "data_dir = '../../data/spMOdata/EpiTran_HumanMouse_Zhang2023/preprocessed_from_fragments/'\n",
    "save_dir = f'../../results/HumanMouse_Zhang2023/mb/'\n",
    "\n",
    "if not os.path.exists(data_dir + f'mb_merged/'):\n",
    "    os.makedirs(data_dir + f'mb_merged/')\n",
    "if not os.path.exists(save_dir):\n",
    "    os.makedirs(save_dir)\n",
    "\n",
    "# load raw data\n",
    "cas_list = []\n",
    "for sample in slice_name_list:\n",
    "    sample_data = ad.read_h5ad(data_dir + sample + '.h5ad')\n",
    "\n",
    "    if 'insertion' in sample_data.obsm:\n",
    "        del sample_data.obsm['insertion']\n",
    "\n",
    "    cas_list.append(sample_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cba40d92-0127-4ef6-9263-4862ba34310a",
   "metadata": {},
   "source": [
    "### Merge the peaks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fa2772a0-bcb0-4ca2-9914-94c6861bf887",
   "metadata": {},
   "outputs": [],
   "source": [
    "cas_list = INSTINCT.peak_sets_alignment(cas_list)\n",
    "\n",
    "# save the merged data\n",
    "for idx, adata in enumerate(cas_list):\n",
    "    adata.write_h5ad(data_dir + f'mb_merged/merged_{slice_name_list[idx]}.h5ad')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a073871b-ae69-46b2-a493-eee4ec29442d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load the merged data\n",
    "cas_list = [ad.read_h5ad(data_dir + f'mb_merged/merged_{sample}.h5ad') for sample in slice_name_list]\n",
    "for j in range(len(cas_list)):\n",
    "    cas_list[j].obs_names = [x + '_' + slice_name_list[j] for x in cas_list[j].obs_names]\n",
    "\n",
    "# read the raw RNA data\n",
    "rna_list = [ad.read_h5ad(data_dir + f'{sample}.h5ad') for sample in rna_slice_name_list]\n",
    "for j in range(len(rna_list)):\n",
    "    rna_list[j].obs_names = [x + '-1_' + slice_name_list[j] for x in rna_list[j].obs_names]\n",
    "    print(rna_list[j].shape)\n",
    "\n",
    "# filter spots that is not tissue\n",
    "for i in range(len(slice_name_list)):\n",
    "    obs_list = [obs_name for obs_name in cas_list[i].obs_names if obs_name in rna_list[i].obs_names]\n",
    "    cas_list[i] = cas_list[i][obs_list, :]\n",
    "    print(cas_list[i].shape)\n",
    "\n",
    "# concatenation\n",
    "adata_concat = ad.concat(cas_list, label=\"slice_name\", keys=slice_name_list)\n",
    "# adata_concat.obs_names_make_unique()\n",
    "print(adata_concat.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "215f2ba4-c524-4f8b-9c63-5c52bb22174f",
   "metadata": {},
   "source": [
    "### Data preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cd7b1605-ce32-49a8-b5fc-9822d049666f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# preprocess CAS data\n",
    "print('Start preprocessing')\n",
    "INSTINCT.preprocess_CAS(cas_list, adata_concat, use_fragment_count=True, min_cells_rate=0.02)\n",
    "print(adata_concat.shape)\n",
    "print('Done!')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c59b125d-0611-4989-a814-34a9b3795737",
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_concat.write_h5ad(save_dir + f\"preprocessed_concat.h5ad\")\n",
    "for i in range(len(slice_name_list)):\n",
    "    cas_list[i].write_h5ad(save_dir + f\"filtered_merged_{slice_name_list[i]}.h5ad\")\n",
    "\n",
    "cas_list = [ad.read_h5ad(save_dir + f\"filtered_merged_{sample}.h5ad\") for sample in slice_name_list]\n",
    "# origin_concat = ad.concat(cas_list, label=\"slice_idx\", keys=slice_index_list)\n",
    "adata_concat = ad.read_h5ad(save_dir + f\"preprocessed_concat.h5ad\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c818625f-f1ab-436e-8718-338f5db448dd",
   "metadata": {},
   "source": [
    "### Perform PCA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0767875b-7989-4d8b-b8f5-fc3671d7049d",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f'Applying PCA to reduce the feature dimension to 100 ...')\n",
    "pca = PCA(n_components=100, random_state=1234)\n",
    "input_matrix = pca.fit_transform(adata_concat.X.toarray())\n",
    "np.save(save_dir + 'input_matrix.npy', input_matrix)\n",
    "print('Done !')\n",
    "\n",
    "input_matrix = np.load(save_dir + 'input_matrix.npy')\n",
    "adata_concat.obsm['X_pca'] = input_matrix"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "434de957-dc51-4f27-9d31-c06c654c5b60",
   "metadata": {},
   "source": [
    "### Create neighbor graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e52a5727-027b-412b-af28-474e7e305362",
   "metadata": {},
   "outputs": [],
   "source": [
    "# calculate the spatial graph\n",
    "INSTINCT.create_neighbor_graph(cas_list, adata_concat)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a3455b98-610f-4073-910c-7b04aaa98c1a",
   "metadata": {},
   "source": [
    "### Data integration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d7407a51-368f-43ff-9caa-eb903011122f",
   "metadata": {},
   "outputs": [],
   "source": [
    "INSTINCT_model = INSTINCT.INSTINCT_Model(cas_list, adata_concat, device=device)\n",
    "\n",
    "INSTINCT_model.train(report_loss=True, report_interval=100)\n",
    "\n",
    "INSTINCT_model.eval(cas_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0d8f7251-a8b6-4dd1-bab5-887ae1e2dfac",
   "metadata": {},
   "outputs": [],
   "source": [
    "result = ad.concat(cas_list, label=\"slice_idx\", keys=slice_index_list)\n",
    "\n",
    "with open(save_dir + 'INSTINCT_embed.csv', 'w', newline='') as file:\n",
    "    writer = csv.writer(file)\n",
    "    writer.writerows(result.obsm['INSTINCT_latent'])\n",
    "\n",
    "with open(save_dir + 'INSTINCT_noise_embed.csv', 'w', newline='') as file:\n",
    "    writer = csv.writer(file)\n",
    "    writer.writerows(result.obsm['INSTINCT_latent_noise'])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}