{ "cells": [ { "cell_type": "markdown", "id": "4c02d766-2ca2-4ae1-8ef8-7a8498c1b6a4", "metadata": {}, "source": [ "# Expression enrichment analysis\n", "Identify the spot-type-specific peaks for each annotated spot-type, using the remaining peaks as background peaks, and performed expression enrichment analysis for the single nucleotide polymorphisms (SNPs) in each set of spot-type-specific peaks and the set of background peaks, respectively" ] }, { "cell_type": "code", "execution_count": null, "id": "76321e73-9a27-4a13-80d9-2b4a1b0a7e5c", "metadata": {}, "outputs": [], "source": [ "options(warn=-1)\n", "library(scABC)\n", "library(chromVAR)\n", "library(motifmatchr)\n", "library(SummarizedExperiment)\n", "library(Matrix)\n", "# library(ggplot2)\n", "library(BiocParallel)\n", "library(BSgenome.Hsapiens.UCSC.hg19)\n", "library(JASPAR2016)\n", "library(data.table)\n", "library(parallel)\n", "set.seed(1234)" ] }, { "cell_type": "markdown", "id": "461247b4-9a50-4b46-af19-9d6939ebf774", "metadata": {}, "source": [ "### Untar the downloaded reference data\n", "The reference data can be downloaded [here](https://doi.org/10.5281/zenodo.7768714)." ] }, { "cell_type": "code", "execution_count": null, "id": "54f21263-a78d-429e-938c-b38733274b11", "metadata": {}, "outputs": [], "source": [ "archive_file <- \"C:/University/FINAL/data/SNP_files/1000G_Phase3_plinkfiles.tgz\"\n", "extract_dir <- \"C:/University/FINAL/data/SNP_files/\"\n", "untar(archive_file, exdir = extract_dir)" ] }, { "cell_type": "code", "execution_count": null, "id": "a3063233-bdb2-4568-a56a-63fccdfbd332", "metadata": {}, "outputs": [], "source": [ "# find and return indices for overlaps\n", "find.index <- function(df1,df2,type='reg'){\n", " #colnames(df1) <- colnames(df2) <- c('V1','V2','V3')\n", " library(GenomicRanges)\n", " df1.gr = GRanges (IRanges(start = df1$V2, end = df1$V3), seqnames=df1$V1)\n", " if(type=='reg'){\n", " df2.gr = GRanges(IRanges(start=df2$V2, end = df2$V3), seqnames = df2$V1)\n", " }\n", " if(type=='pos'){\n", " df2.gr = GRanges(IRanges(start=df2$V4, end = df2$V4), seqnames = df2$V1)\n", " }\n", " df1.git = GNCList(df1.gr)\n", " df2.git = GNCList(df2.gr)\n", " overlap_git = findOverlaps(df2.git, df1.git)\n", " overlap_git\n", " temp <- as.data.frame(overlap_git)\n", " colnames(temp) <- c('df2','df1')\n", " return(temp)\n", "}" ] }, { "cell_type": "markdown", "id": "17139851-58f9-4476-b145-d4a6ea5678ba", "metadata": {}, "source": [ "### Find SNPs for each set of specific peaks\n", "The peaks should be first map to GRCh37/hg19 from mm10 using [LiftOver](http://genome.ucsc.edu/) and named accordingly." ] }, { "cell_type": "code", "execution_count": null, "id": "33c29569-7677-40fc-a335-91abaf433d33", "metadata": {}, "outputs": [], "source": [ "mode_list <- c('E11_0', 'E13_5', 'E15_5', 'E18_5')\n", "mode_index <- 4\n", "mode = mode_list[mode_index]\n", "\n", "\n", "# Load data\n", "data_path <- \"C:/University/FINAL/data/SNP_files/\"\n", "for(chr in 1:22){\n", " if(chr==1){\n", " # https://doi.org/10.5281/zenodo.7768714 -> 1000G_Phase3_plinkfiles.tgz\n", " bim <- fread(paste0(data_path, '1000G_EUR_Phase3_plink/1000G.EUR.QC.1.bim'))\n", " }else{\n", " bim <- rbind(bim,fread(paste0(data_path, '1000G_EUR_Phase3_plink/1000G.EUR.QC.',chr,'.bim')))\n", " }\n", "} ## 1000G bim file\n", "bim$V1 <- paste0('chr',bim$V1)\n", "\n", "\n", "inputpath <- paste(\"C:/University/FINAL/results/MouseBrain_Jiang2023/vertical\", mode, 'INSTINCT/GRCh37hg19/S2/', sep = \"/\")\n", "outputpath <- paste(\"C:/University/FINAL/results/MouseBrain_Jiang2023/vertical\", mode, 'INSTINCT/SNPs/S2/', sep = \"/\")\n", "if (!dir.exists(outputpath)) {\n", " dir.create(outputpath, recursive = TRUE)\n", "}" ] }, { "cell_type": "code", "execution_count": null, "id": "aaa40717-f3ed-4f1a-9485-142f1db3a846", "metadata": {}, "outputs": [], "source": [ "## 1. bg\n", "name <- c('bg_all')\n", "for(k in 1:1){\n", " if (file.exists(paste0(inputpath,name[k],'.bed'))){\n", " print(name[k])\n", " bed_file <- fread(paste0(inputpath,name[k],'.bed'), header = FALSE, sep = \"\\t\", quote = \"\")\n", " chromatin <- sub(\":.*\", \"\", bed_file$V1)\n", " positions <- gsub(\".*:\", \"\", bed_file$V1)\n", " start <- as.numeric(gsub(\"-.*\", \"\", positions))\n", " end <- as.numeric(gsub(\".*-\", \"\", positions))\n", " peak <- data.frame(V1 = chromatin, V2 = start, V3 = end)\n", " }else{\n", " next\n", " }\n", " ind.temp <- find.index(peak,bim,type='pos')\n", " bim1 <- bim[ind.temp$df2,c(1,4,2)]\n", " colnames(bim1) <- c(\"CHR\",\"POS\",\"SNP\")\n", " print(dim(bim1))\n", " if (nrow(bim1) > 0){\n", " write.table(bim1,paste0(outputpath,gsub(\" \",\"_\",name[k]),'.anno'),sep=\"\\t\",quote=F,col.names=T,row.names=F)\n", " }else{\n", " print('Skipped !')\n", " }\n", "}\n", "\n", "## 2. specific annotations\n", "name <- c('Midbrain', 'Diencephalon_and_hindbrain', 'Basal_plate_of_hindbrain', 'Subpallium_1', 'Subpallium_2',\n", " 'Cartilage_1', 'Cartilage_2', 'Cartilage_3', 'Cartilage_4', 'Mesenchyme', 'Muscle', 'Thalamus', 'DPallm', 'DPallv')\n", "for(k in 1:14){\n", " if (file.exists(paste0(inputpath,name[k],'.bed'))){\n", " print(name[k])\n", " bed_file <- fread(paste0(inputpath,name[k],'.bed'), header = FALSE, sep = \"\\t\", quote = \"\")\n", " chromatin <- sub(\":.*\", \"\", bed_file$V1)\n", " positions <- gsub(\".*:\", \"\", bed_file$V1)\n", " start <- as.numeric(gsub(\"-.*\", \"\", positions))\n", " end <- as.numeric(gsub(\".*-\", \"\", positions))\n", " peak <- data.frame(V1 = chromatin, V2 = start, V3 = end)\n", " }else{\n", " next\n", " }\n", " ind.temp <- find.index(peak,bim,type='pos')\n", " bim1 <- bim[ind.temp$df2,c(1,4,2)]\n", " colnames(bim1) <- c(\"CHR\",\"POS\",\"SNP\")\n", " print(dim(bim1))\n", " if (nrow(bim1) > 0){\n", " write.table(bim1,paste0(outputpath,gsub(\" \",\"_\",name[k]),'.anno'),sep=\"\\t\",quote=F,col.names=T,row.names=F)\n", " }else{\n", " print('Skipped !')\n", " }" ] }, { "cell_type": "markdown", "id": "524be2b8-e3db-4ff7-8df3-75176010dc29", "metadata": {}, "source": [ "### Perform expression enrichment analysis using [SNPsea](https://github.com/slowkow/snpsea/)" ] }, { "cell_type": "code", "execution_count": null, "id": "c9ace60f-cd83-431c-a9de-b11b0ab21208", "metadata": {}, "outputs": [], "source": [ "# shell\n", "options=(\n", " --snps /home/liuyy/Final/results/MouseBrain_Jiang2023/vertical/E18_5/INSTINCT/SNPs/S2/Subpallium_2.anno\n", " --gene-matrix /home/liuyy/Final/SNPsea/SNPsea_data_20140520/GeneAtlas2004.gct.gz\n", " --gene-intervals /home/liuyy/Final/SNPsea/SNPsea_data_20140520/NCBIgenes2013.bed.gz\n", " --snp-intervals /home/liuyy/Final/SNPsea/SNPsea_data_20140520/TGP2011.bed.gz\n", " --null-snps /home/liuyy/Final/SNPsea/SNPsea_data_20140520/Lango2010.txt.gz\n", " --out /home/liuyy/Final/results/MouseBrain_Jiang2023/vertical/E18_5/INSTINCT/Expression_enrichment_analysis/S2/Subpallium_2.anno.out\n", " --slop 10e3\n", " --threads 8\n", " --null-snpsets 0\n", " --min-observations 100\n", " --max-iterations 1e7\n", ")\n", "/home/liuyy/Final/SNPsea/snpsea_v1.0.2/bin/snpsea ${options[*]}\n", "/home/liuyy/Final/SNPsea/snpsea_v1.0.2/bin/snpsea-barplot_modified /home/liuyy/Final/results/MouseBrain_Jiang2023/vertical/E18_5/INSTINCT/Expression_enrichment_analysis/S2/Subpallium_2.anno.out --top 30 --fontsize 10" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" } }, "nbformat": 4, "nbformat_minor": 5 }