{ "cells": [ { "cell_type": "markdown", "id": "7a622c0a-ce58-4925-8276-a665d1a433d6", "metadata": {}, "source": [ "# Motif enrichment analysis" ] }, { "cell_type": "code", "execution_count": null, "id": "ca1b44b0-2b86-456b-b6ac-8802258d48de", "metadata": {}, "outputs": [], "source": [ "library(SummarizedExperiment)\n", "# library(Matrix)\n", "library(reticulate)\n", "use_condaenv(\"epi_integration\", required = TRUE)\n", "ad <- import(\"anndata\")" ] }, { "cell_type": "code", "execution_count": null, "id": "bdf7d68f-4dbf-45ea-9376-218862c62e67", "metadata": {}, "outputs": [], "source": [ "getbetaMLE <- function(data, cluster, offset, beta_ini, maxiter, thres, quiet){\n", " ## data is n by p\n", " ## cluster is the cluster membership matrix, length n. Take entries 1 to nCluster\n", " ## offset corresponds to h_i, here it is a vector of length n\n", " p <- ncol(data)\n", " x <- getdesign(cluster)\n", " beta <- beta_ini\n", " betaDiffs <- c()\n", " percConverged <- c()\n", " converged_flag <- rep(0, p)\n", "\n", " converged <- 0\n", " iter <- 1\n", " while (!converged & iter2){\n", " pvs <- c()\n", " for (i in 1:nCluster){\n", " pvs <- cbind(pvs, apply(pvsA[which(constrastsCluster==i),], 2, max))\n", " }\n", " } else {\n", " pvs <- t(pvsA)\n", " }\n", " return(list(beta=beta, pvalue=pvs))\n", "}\n", "\n", "\n", "getClusterSpecificPvalue <- function(data, cluster, offset, landmark=NULL, maxiter=1000, thresMLE=10^-3, thresMAP=10^-5, quiet=FALSE){\n", " ## the main function for peak selection\n", " ## data is nPeaks(p) by Cells(n)\n", " ## cluster is the cluster membership matrix, length n. Take entries 1 to nCluster\n", " ## offset corresponds to h_i, here it is a vector of length n\n", " ## landmark is optional. If landmark is provided, we only do hypothesis testing on the union of landmark peaks\n", " if (!is.null(landmark)){\n", " ## take union of the landmark peaks\n", " unionlandmark <- which(rowSums(landmark)!=0)\n", " p <- nrow(data)\n", " data <- data[unionlandmark,]\n", " } else {\n", " }\n", "\n", " data <- t(data) # make data n by p\n", " ## remove the offset=0 samples\n", " data <- data[which(offset>0),]\n", " cluster <- cluster[which(offset>0)]\n", " offset <- offset[which(offset>0)]\n", " ## get beta_MLE\n", " if (!quiet){\n", " cat(\"\\nEstimating beta MLE\\n\")\n", " }\n", " betaMLE_ini <- matrix(0, nrow=length(unique(cluster)), ncol=ncol(data))\n", " betaMLE <- getbetaMLE(data=data, cluster=cluster, offset=offset, beta_ini=betaMLE_ini, maxiter=maxiter, thres=thresMLE, quiet=quiet)\n", "\n", " ## get the empirical prior sigma\n", " sigmas <- getSigmaPrior(betaMLE)\n", "\n", " ## get beta_MAP and the p-value\n", " if (!quiet){\n", " cat(\"\\nEstimating beta MAP\\n\")\n", " }\n", " betaMAP_ini <- rbind(0, matrix(0, nrow=length(unique(cluster)), ncol=ncol(data)))\n", " result <- getbetaMAP(data=data, cluster=cluster, offset=offset, sigmas=sigmas, beta_ini=betaMAP_ini, maxiter=maxiter, thres=thresMAP, quiet=quiet)\n", "\n", " if (!is.null(landmark)){\n", " betaMAP <- matrix( nrow=length(unique(cluster))+1, ncol=p )\n", " pvalue <- matrix( nrow=p, ncol=length(unique(cluster)) )\n", " betaMAP[, unionlandmark] <- result$beta\n", " pvalue[unionlandmark,] <- result$pvalue\n", " } else {\n", " betaMAP <- result$beta\n", " pvalue <- result$pvalue\n", " }\n", " colnames(pvalue) <- paste(\"Cluster\", 1:length(unique(cluster)) )\n", " return(list(betaMAP=betaMAP, sigmas=sigmas, pvalue=pvalue))\n", "}" ] }, { "cell_type": "markdown", "id": "f65b93fd-d121-4d98-b202-5ea5d35dc446", "metadata": {}, "source": [ "### Load data with selected peaks" ] }, { "cell_type": "code", "execution_count": null, "id": "101e0cd1-a703-4b7d-9e9e-1b08591b6b65", "metadata": {}, "outputs": [], "source": [ "labels <- c('GSM5238385', 'GSM5238386', 'GSM5238387')\n", "slice_name_list <- c('GSM5238385_ME11_50um', 'GSM5238386_ME13_50um', 'GSM5238387_ME13_50um_2')\n", "\n", "save_dir <- 'C:/University/FINAL/results/HumanMouse_Deng2022/[0, 1, 2]/concat/'\n", "if (!dir.exists(paste0(save_dir, 'motif_enrichment_analysis/'))) {\n", " dir.create(paste0(save_dir, 'motif_enrichment_analysis/'), recursive = TRUE)\n", "}\n", "\n", "adata <- ad$read_h5ad(paste0(save_dir, \"selected_concat.h5ad\"))" ] }, { "cell_type": "markdown", "id": "580e7d7a-c2ea-4259-ae67-a3eae7545e63", "metadata": {}, "source": [ "### Find enriched motifs" ] }, { "cell_type": "code", "execution_count": null, "id": "46e096c8-ddae-43bb-8f9e-1d77debfe875", "metadata": {}, "outputs": [], "source": [ "# row_data <- adata$var\n", "col_data <- adata$obs\n", "metaData <- data.frame(cluster_label=(adata$obs$leiden))\n", "metaDataInd = with(metaData, order(cluster_label))\n", "col_data <- col_data[metaDataInd,]\n", "count_matrix <- t(adata$X)\n", "count_matrix = count_matrix[,metaDataInd]\n", "# colnames(count_matrix)=1:ncol(count_matrix)\n", "\n", "sorted_peaks <- readLines(paste0(save_dir, 'peaks/all_specific_peaks.txt'))\n", "\n", "chr=c()\n", "p1=c()\n", "p2=c()\n", "for (i in 1:length(sorted_peaks)) {\n", " strings <- unlist(strsplit(sorted_peaks[[i]], \":\"))\n", " chr=c(chr,strings[1])\n", " strings <- unlist(strsplit(strings[2], \"-\"))\n", " p1=c(p1,strings[1])\n", " p2=c(p2,strings[2])\n", "}\n", "\n", "peakrange=data.frame(chr,p1,p2)\n", "peakrange$p1=as.numeric(as.character(peakrange$p1))\n", "peakrange$p2=as.numeric(as.character(peakrange$p2))\n", "head(peakrange)\n", "peaks.gr = GRanges(peakrange[,1], IRanges(peakrange[,2], peakrange[,3]))\n", "\n", "\n", "frag_counts = SummarizedExperiment(\n", " assays = SimpleList(counts = count_matrix),\n", " rowRanges = peaks.gr,\n", " colData = col_data,\n", ")\n", "\n", "library(chromVAR)\n", "library(motifmatchr)\n", "library(BSgenome.Mmusculus.UCSC.mm10)\n", "\n", "frag_counts = addGCBias(frag_counts, genome = BSgenome.Mmusculus.UCSC.mm10)\n", "motifs <- getJasparMotifs(species = \"Mus musculus\")\n", "motifs.matched = matchMotifs(motifs, frag_counts, genome = BSgenome.Mmusculus.UCSC.mm10)\n", "dev = computeDeviations(object = frag_counts, annotations = motifs.matched)\n", "dev.scores = deviationScores(dev)\n", "print(dim(dev.scores))\n", "\n", "# na_columns <- colSums(is.na(dev.scores)) > 0\n", "# na_columns_names <- colnames(dev.scores)[na_columns]\n", "# dev_filtered <- dev[, !na_columns]\n", "# dev <- dev_filtered\n", "# dev.scores <- dev.scores[, !na_columns]\n", "# print(dim(dev.scores))\n", "\n", "variability = computeVariability(dev)\n", "\n", "var_plot <- plotVariability(variability, use_plotly = TRUE)\n", "\n", "\n", "top_motifs = variability$name[head(order(variability$variability, decreasing = TRUE), 50)]\n", "names(top_motifs) = rownames(variability[head(order(variability$variability, decreasing = TRUE), 50), ])\n", "top_devs = dev.scores[which(rownames(dev.scores) %in% names(top_motifs)), ]\n", "rownames(top_devs) = top_motifs[match(rownames(top_devs), names(top_motifs))]\n", "\n", "top_devs_t = t(top_devs)\n", "write.csv(top_devs_t, paste0(save_dir, \"motif_enrichment_analysis/sorted_devs.csv\"))" ] }, { "cell_type": "markdown", "id": "4c6a7f54-8041-402a-9941-db5da143bfa8", "metadata": {}, "source": [ "### Plot enriched motifs\n", "Below are python codes" ] }, { "cell_type": "code", "execution_count": null, "id": "8bad6320-96ca-4aa2-9981-67aa1bf42e1d", "metadata": {}, "outputs": [], "source": [ "import os\n", "import numpy as np\n", "import anndata as ad\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")\n", "\n", "import matplotlib as mpl\n", "mpl.rcParams['pdf.fonttype'] = 42\n", "mpl.rcParams['ps.fonttype'] = 42" ] }, { "cell_type": "code", "execution_count": null, "id": "7b68edbb-2aa6-41e2-84f1-c1c681e007aa", "metadata": {}, "outputs": [], "source": [ "save = True\n", "method = 'leiden'\n", "file_format = 'png'\n", "cmap = 'bwr'\n", "\n", "data_dir = '../../data/spCASdata/HumanMouse_Deng2022/preprocessed/'\n", "save_dir = '../../results/HumanMouse_Deng2022/'\n", "slice_name_list = ['GSM5238385_ME11_50um', 'GSM5238386_ME13_50um', 'GSM5238387_ME13_50um_2']\n", "label_list = ['GSM5238385', 'GSM5238386', 'GSM5238387']\n", "slice_used = [0, 1, 2]\n", "slice_name_list = [slice_name_list[i] for i in slice_used]\n", "label_list = [label_list[i] for i in slice_used]\n", "slice_index_list = list(range(len(slice_name_list)))\n", "\n", "save_dir = f'../../results/HumanMouse_Deng2022/{slice_used}/'\n", "\n", "motif_list = ['Gata1', 'Gata4', 'Gabpa', 'Klf1', 'Klf12', 'FOS::JUN', 'Nfe2l2',\n", " 'Pou5f1::Sox2', 'Rfx1', 'Pou2f3', 'Sox3', 'Sox6', 'Sox2']\n", "save_name_list = ['Gata1', 'Gata4', 'Gabpa', 'Klf1', 'Klf12', 'FOS_JUN', 'Nfe2l2',\n", " 'Pou5f1_Sox2', 'Rfx1', 'Pou2f3', 'Sox3', 'Sox6', 'Sox2']\n", "\n", "motif_scores = pd.read_csv(save_dir + f'concat/motif_enrichment_analysis/sorted_devs.csv', index_col=0, header=0)" ] }, { "cell_type": "code", "execution_count": null, "id": "b53e3798-4c06-4824-9373-36bb715b5637", "metadata": {}, "outputs": [], "source": [ "for k, motif in enumerate(motif_list):\n", "\n", " print(motif)\n", "\n", " if not os.path.exists(save_dir + f'concat/motif_enrichment_analysis/{save_name_list[k]}/') and save:\n", " os.makedirs(save_dir + f'concat/motif_enrichment_analysis/{save_name_list[k]}/')\n", "\n", " adata_concat = ad.read_h5ad(save_dir + f'concat/selected_concat.h5ad')\n", " print(adata_concat.shape)\n", "\n", " adata_concat.obsm['raw_emb'] = pd.read_csv(save_dir + f'sp_embeddings_raw.csv', header=None).values\n", " adata_concat.obsm['inte_emb'] = pd.read_csv(save_dir + f'sp_embeddings_integrated.csv', header=None).values\n", "\n", " adata_concat = adata_concat[adata_concat.obs_names.isin(motif_scores.index)]\n", " adata_concat.obs[motif] = motif_scores.loc[adata_concat.obs_names, motif].values\n", " adata_concat = adata_concat[~adata_concat.obs[motif].isna()]\n", " print(adata_concat.shape)\n", "\n", " data_column = adata_concat.obs[motif].copy()\n", "\n", " # raw\n", " sp_embedding = adata_concat.obsm['raw_emb'].copy()\n", " n_spots = adata_concat.shape[0]\n", " size = 10000 / n_spots\n", " order = np.arange(n_spots)\n", " plt.figure(figsize=(6, 5))\n", " plt.scatter(sp_embedding[order, 0], sp_embedding[order, 1], s=size, c=data_column, cmap=cmap)\n", " plt.tick_params(axis='both', bottom=False, top=False, left=False, right=False,\n", " labelleft=False, labelbottom=False, grid_alpha=0)\n", " plt.colorbar(label='Color')\n", "\n", " plt.title(f'Raw ({motif})', fontsize=14)\n", " if save:\n", " save_path = save_dir + f\"concat/motif_enrichment_analysis/{save_name_list[k]}/{save_name_list[k]}_raw_umap.{file_format}\"\n", " plt.savefig(save_path)\n", "\n", " # integrated\n", " sp_embedding = adata_concat.obsm['inte_emb'].copy()\n", " n_spots = adata_concat.shape[0]\n", " size = 10000 / n_spots\n", " order = np.arange(n_spots)\n", " plt.figure(figsize=(6, 5))\n", " plt.scatter(sp_embedding[order, 0], sp_embedding[order, 1], s=size, c=data_column, cmap=cmap)\n", " plt.tick_params(axis='both', bottom=False, top=False, left=False, right=False,\n", " labelleft=False, labelbottom=False, grid_alpha=0)\n", " plt.colorbar(label='Color')\n", "\n", " plt.title(f'Integrated ({motif})', fontsize=14)\n", " if save:\n", " save_path = save_dir + f\"concat/motif_enrichment_analysis/{save_name_list[k]}/{save_name_list[k]}_integrated_umap.{file_format}\"\n", " plt.savefig(save_path)\n", "\n", " scalar_mappable = plt.cm.ScalarMappable(cmap=cmap)\n", " colors = scalar_mappable.to_rgba(data_column)\n", "\n", " spots_count = [0]\n", " n = 0\n", " for label in label_list:\n", " filtered_rows = adata_concat.obs[adata_concat.obs['slice_name'] == label]\n", " num = filtered_rows.shape[0]\n", " n += num\n", " spots_count.append(n)\n", "\n", " if len(slice_name_list) == 2:\n", " fig, axs = plt.subplots(1, 2, figsize=(8, 4))\n", " elif len(slice_name_list) == 3:\n", " fig, axs = plt.subplots(1, 3, figsize=(12, 4))\n", " for i in range(len(slice_name_list)):\n", " cluster_colors = list(colors[spots_count[i]: spots_count[i + 1]])\n", " axs[i].scatter(adata_concat[spots_count[i]: spots_count[i + 1]].obsm['spatial'][:, 0],\n", " adata_concat[spots_count[i]: spots_count[i + 1]].obsm['spatial'][:, 1],\n", " linewidth=1, s=40, marker=\".\", color=cluster_colors, alpha=0.9)\n", " axs[i].invert_yaxis()\n", " axs[i].set_title(f'{label_list[i]} ({motif})', size=12)\n", " axs[i].axis('off')\n", " plt.gcf().subplots_adjust(left=0.05, top=0.8, bottom=0.05, right=0.90)\n", " if save:\n", " save_path = save_dir + f'concat/motif_enrichment_analysis/{save_name_list[k]}/{save_name_list[k]}_spatial.{file_format}'\n", " plt.savefig(save_path)\n", " plt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" } }, "nbformat": 4, "nbformat_minor": 5 }