{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "4c02d766-2ca2-4ae1-8ef8-7a8498c1b6a4",
   "metadata": {},
   "source": [
    "# Expression enrichment analysis\n",
    "Identify the spot-type-specific peaks for each annotated spot-type, using the remaining peaks as background peaks, and performed expression enrichment analysis for the single nucleotide polymorphisms (SNPs) in each set of spot-type-specific peaks and the set of background peaks, respectively"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "76321e73-9a27-4a13-80d9-2b4a1b0a7e5c",
   "metadata": {},
   "outputs": [],
   "source": [
    "options(warn=-1)\n",
    "library(scABC)\n",
    "library(chromVAR)\n",
    "library(motifmatchr)\n",
    "library(SummarizedExperiment)\n",
    "library(Matrix)\n",
    "# library(ggplot2)\n",
    "library(BiocParallel)\n",
    "library(BSgenome.Hsapiens.UCSC.hg19)\n",
    "library(JASPAR2016)\n",
    "library(data.table)\n",
    "library(parallel)\n",
    "set.seed(1234)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "461247b4-9a50-4b46-af19-9d6939ebf774",
   "metadata": {},
   "source": [
    "### Untar the downloaded reference data\n",
    "The reference data can be downloaded [here](https://doi.org/10.5281/zenodo.7768714)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "54f21263-a78d-429e-938c-b38733274b11",
   "metadata": {},
   "outputs": [],
   "source": [
    "archive_file <- \"C:/University/FINAL/data/SNP_files/1000G_Phase3_plinkfiles.tgz\"\n",
    "extract_dir <- \"C:/University/FINAL/data/SNP_files/\"\n",
    "untar(archive_file, exdir = extract_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a3063233-bdb2-4568-a56a-63fccdfbd332",
   "metadata": {},
   "outputs": [],
   "source": [
    "# find and return indices for overlaps\n",
    "find.index <- function(df1,df2,type='reg'){\n",
    "    #colnames(df1) <- colnames(df2) <- c('V1','V2','V3')\n",
    "    library(GenomicRanges)\n",
    "    df1.gr = GRanges (IRanges(start = df1$V2, end = df1$V3), seqnames=df1$V1)\n",
    "    if(type=='reg'){\n",
    "        df2.gr = GRanges(IRanges(start=df2$V2, end = df2$V3), seqnames = df2$V1)\n",
    "    }\n",
    "    if(type=='pos'){\n",
    "        df2.gr = GRanges(IRanges(start=df2$V4, end = df2$V4), seqnames = df2$V1)\n",
    "    }\n",
    "    df1.git  = GNCList(df1.gr)\n",
    "    df2.git  = GNCList(df2.gr)\n",
    "    overlap_git = findOverlaps(df2.git, df1.git)\n",
    "    overlap_git\n",
    "    temp <- as.data.frame(overlap_git)\n",
    "    colnames(temp) <- c('df2','df1')\n",
    "    return(temp)\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "17139851-58f9-4476-b145-d4a6ea5678ba",
   "metadata": {},
   "source": [
    "### Find SNPs for each set of specific peaks\n",
    "The peaks should be first map to GRCh37/hg19 from mm10 using [LiftOver](http://genome.ucsc.edu/) and named accordingly."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "33c29569-7677-40fc-a335-91abaf433d33",
   "metadata": {},
   "outputs": [],
   "source": [
    "mode_list <- c('E11_0', 'E13_5', 'E15_5', 'E18_5')\n",
    "mode_index <- 4\n",
    "mode = mode_list[mode_index]\n",
    "\n",
    "\n",
    "# Load data\n",
    "data_path <- \"C:/University/FINAL/data/SNP_files/\"\n",
    "for(chr in 1:22){\n",
    "    if(chr==1){\n",
    "        # https://doi.org/10.5281/zenodo.7768714 -> 1000G_Phase3_plinkfiles.tgz\n",
    "        bim <- fread(paste0(data_path, '1000G_EUR_Phase3_plink/1000G.EUR.QC.1.bim'))\n",
    "    }else{\n",
    "        bim <- rbind(bim,fread(paste0(data_path, '1000G_EUR_Phase3_plink/1000G.EUR.QC.',chr,'.bim')))\n",
    "    }\n",
    "} ## 1000G bim file\n",
    "bim$V1 <- paste0('chr',bim$V1)\n",
    "\n",
    "\n",
    "inputpath <- paste(\"C:/University/FINAL/results/MouseBrain_Jiang2023/vertical\", mode, 'INSTINCT/GRCh37hg19/S2/', sep = \"/\")\n",
    "outputpath <- paste(\"C:/University/FINAL/results/MouseBrain_Jiang2023/vertical\", mode, 'INSTINCT/SNPs/S2/', sep = \"/\")\n",
    "if (!dir.exists(outputpath)) {\n",
    "    dir.create(outputpath, recursive = TRUE)\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aaa40717-f3ed-4f1a-9485-142f1db3a846",
   "metadata": {},
   "outputs": [],
   "source": [
    "## 1. bg\n",
    "name <- c('bg_all')\n",
    "for(k in 1:1){\n",
    "    if (file.exists(paste0(inputpath,name[k],'.bed'))){\n",
    "        print(name[k])\n",
    "        bed_file <- fread(paste0(inputpath,name[k],'.bed'), header = FALSE, sep = \"\\t\", quote = \"\")\n",
    "        chromatin <- sub(\":.*\", \"\", bed_file$V1)\n",
    "        positions <- gsub(\".*:\", \"\", bed_file$V1)\n",
    "        start <- as.numeric(gsub(\"-.*\", \"\", positions))\n",
    "        end <- as.numeric(gsub(\".*-\", \"\", positions))\n",
    "        peak <- data.frame(V1 = chromatin, V2 = start, V3 = end)\n",
    "    }else{\n",
    "        next\n",
    "    }\n",
    "    ind.temp <- find.index(peak,bim,type='pos')\n",
    "    bim1 <- bim[ind.temp$df2,c(1,4,2)]\n",
    "    colnames(bim1) <- c(\"CHR\",\"POS\",\"SNP\")\n",
    "    print(dim(bim1))\n",
    "    if (nrow(bim1) > 0){\n",
    "        write.table(bim1,paste0(outputpath,gsub(\" \",\"_\",name[k]),'.anno'),sep=\"\\t\",quote=F,col.names=T,row.names=F)\n",
    "    }else{\n",
    "        print('Skipped !')\n",
    "    }\n",
    "}\n",
    "\n",
    "## 2. specific annotations\n",
    "name <- c('Midbrain', 'Diencephalon_and_hindbrain', 'Basal_plate_of_hindbrain', 'Subpallium_1', 'Subpallium_2',\n",
    "          'Cartilage_1', 'Cartilage_2', 'Cartilage_3', 'Cartilage_4', 'Mesenchyme', 'Muscle', 'Thalamus', 'DPallm', 'DPallv')\n",
    "for(k in 1:14){\n",
    "    if (file.exists(paste0(inputpath,name[k],'.bed'))){\n",
    "        print(name[k])\n",
    "        bed_file <- fread(paste0(inputpath,name[k],'.bed'), header = FALSE, sep = \"\\t\", quote = \"\")\n",
    "        chromatin <- sub(\":.*\", \"\", bed_file$V1)\n",
    "        positions <- gsub(\".*:\", \"\", bed_file$V1)\n",
    "        start <- as.numeric(gsub(\"-.*\", \"\", positions))\n",
    "        end <- as.numeric(gsub(\".*-\", \"\", positions))\n",
    "        peak <- data.frame(V1 = chromatin, V2 = start, V3 = end)\n",
    "    }else{\n",
    "        next\n",
    "    }\n",
    "    ind.temp <- find.index(peak,bim,type='pos')\n",
    "    bim1 <- bim[ind.temp$df2,c(1,4,2)]\n",
    "    colnames(bim1) <- c(\"CHR\",\"POS\",\"SNP\")\n",
    "    print(dim(bim1))\n",
    "    if (nrow(bim1) > 0){\n",
    "        write.table(bim1,paste0(outputpath,gsub(\" \",\"_\",name[k]),'.anno'),sep=\"\\t\",quote=F,col.names=T,row.names=F)\n",
    "    }else{\n",
    "        print('Skipped !')\n",
    "    }"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "524be2b8-e3db-4ff7-8df3-75176010dc29",
   "metadata": {},
   "source": [
    "### Perform expression enrichment analysis using [SNPsea](https://github.com/slowkow/snpsea/)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c9ace60f-cd83-431c-a9de-b11b0ab21208",
   "metadata": {},
   "outputs": [],
   "source": [
    "# shell\n",
    "options=(\n",
    "    --snps              /home/liuyy/Final/results/MouseBrain_Jiang2023/vertical/E18_5/INSTINCT/SNPs/S2/Subpallium_2.anno\n",
    "    --gene-matrix       /home/liuyy/Final/SNPsea/SNPsea_data_20140520/GeneAtlas2004.gct.gz\n",
    "    --gene-intervals    /home/liuyy/Final/SNPsea/SNPsea_data_20140520/NCBIgenes2013.bed.gz\n",
    "    --snp-intervals     /home/liuyy/Final/SNPsea/SNPsea_data_20140520/TGP2011.bed.gz\n",
    "    --null-snps         /home/liuyy/Final/SNPsea/SNPsea_data_20140520/Lango2010.txt.gz\n",
    "    --out               /home/liuyy/Final/results/MouseBrain_Jiang2023/vertical/E18_5/INSTINCT/Expression_enrichment_analysis/S2/Subpallium_2.anno.out\n",
    "    --slop              10e3\n",
    "    --threads           8\n",
    "    --null-snpsets      0\n",
    "    --min-observations  100\n",
    "    --max-iterations    1e7\n",
    ")\n",
    "/home/liuyy/Final/SNPsea/snpsea_v1.0.2/bin/snpsea ${options[*]}\n",
    "/home/liuyy/Final/SNPsea/snpsea_v1.0.2/bin/snpsea-barplot_modified /home/liuyy/Final/results/MouseBrain_Jiang2023/vertical/E18_5/INSTINCT/Expression_enrichment_analysis/S2/Subpallium_2.anno.out --top 30 --fontsize 10"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}