fuxialexander committed on
Commit
7b04c5f
·
1 Parent(s): 80fdd7a
Dockerfile-dockerhub CHANGED
@@ -1,5 +1,5 @@
1
  # This is the dockerfile for dockerhub fuxialexander/getdemo:latest
2
- FROM mambaorg/micromamba
3
 
4
 
5
  USER root
@@ -8,48 +8,17 @@ USER $MAMBA_USER
8
 
9
  # Set the working directory in the container to /app
10
  WORKDIR /app
11
- # Create a new environment using mamba with specified packages
12
- RUN micromamba install -n base -c conda-forge -c bioconda -y python=3.10 pip biopython s3fs
13
- RUN micromamba install -n base -c conda-forge -c bioconda -y nglview tqdm matplotlib pandas
14
- RUN micromamba install -n base -c conda-forge -c bioconda -y openpyxl pyarrow python-box xmlschema seaborn numpy py3Dmol pyranges scipy pyyaml zarr numcodecs
15
- RUN micromamba install -n base -c conda-forge -c bioconda -y pybigwig networkx plotly pysam requests seqlogo MOODS urllib3 pyliftover gprofiler-official pyfaidx
16
- RUN micromamba install -n base -c conda-forge -y dash-bio
17
 
18
  ARG MAMBA_DOCKERFILE_ACTIVATE=1
19
- # Activate the environment and install additional packages via pip
20
- RUN pip3 install gradio
21
- USER root
22
- RUN mkdir /data
23
- RUN apt-get update && apt-get install -y --no-install-recommends \
24
- git \
25
- ssh \
26
- && apt-get clean \
27
- && rm -rf /var/lib/apt/lists/*
28
 
29
  USER $MAMBA_USER
30
 
31
- # copy modules from local to container
32
- COPY --chown=$MAMBA_USER:$MAMBA_USER modules /app/modules
33
-
34
  # copy the app from local to container
35
  COPY --chown=$MAMBA_USER:$MAMBA_USER app /app/app
36
 
37
- # copy modules from local to container
38
- # COPY --chown=$MAMBA_USER:$MAMBA_USER data /app/data
39
-
40
- # Clone a specific git repository and install it as an editable package
41
-
42
- RUN cd modules/proscope && \
43
- pip3 install .
44
-
45
- RUN cd modules/atac_rna_data_processing && \
46
- pip3 install .
47
-
48
  # clean all mamba caches and remove unnecessary files
49
  RUN micromamba clean --all --yes
50
 
51
-
52
-
53
  WORKDIR /app
54
 
55
  # Make port 7860 available to the world outside this container
@@ -57,4 +26,4 @@ EXPOSE 7860
57
  # Set the working directory where your app resides
58
 
59
  # Command to run the Gradio app automatically
60
- CMD ["python", "app/main.py", "-p", "7860", "-s", "-u", "s3://2023-get-xf2217/get_demo", "-d", "/data"]
 
1
  # This is the dockerfile for dockerhub fuxialexander/getdemo:latest
2
+ FROM fuxialexander/get_model:latest
3
 
4
 
5
  USER root
 
8
 
9
  # Set the working directory in the container to /app
10
  WORKDIR /app
 
 
 
 
 
 
11
 
12
  ARG MAMBA_DOCKERFILE_ACTIVATE=1
 
 
 
 
 
 
 
 
 
13
 
14
  USER $MAMBA_USER
15
 
 
 
 
16
  # copy the app from local to container
17
  COPY --chown=$MAMBA_USER:$MAMBA_USER app /app/app
18
 
 
 
 
 
 
 
 
 
 
 
 
19
  # clean all mamba caches and remove unnecessary files
20
  RUN micromamba clean --all --yes
21
 
 
 
22
  WORKDIR /app
23
 
24
  # Make port 7860 available to the world outside this container
 
26
  # Set the working directory where your app resides
27
 
28
  # Command to run the Gradio app automatically
29
+ CMD ["python", "app/main.py"]
README.md CHANGED
@@ -7,44 +7,3 @@ sdk: docker
7
  license: cc-by-nc-4.0
8
  pinned: false
9
  ---
10
-
11
-
12
- # Data preparation
13
- Put the data in the following structure in the root directory of the project.
14
- ```bash
15
- data
16
- ├── sequences
17
- │ └── causal
18
- │ ├── MECP2_TFAP2A
19
- │ ├── PRDM1_SMAD2
20
- │ └── TAF1_ZFX
21
- └── structures
22
- ├── causal
23
- │ ├── MECP2_TFAP2A
24
- │ ├── PRDM1_SMAD2
25
- │ └── TAF1_ZFX
26
- └── homodimer
27
- ├── PRDM1
28
- ├── SMAD2
29
- ├── TAF1
30
- └── ZFX
31
- ```
32
-
33
- # Installation
34
- ```bash
35
- git clone --recursive [email protected]:fuxialexander/getdemo.git
36
- cd getdemo
37
- docker pull fuxialexander/getdemo:latest
38
- docker run -it -v "/path/to/data:/data" --rm -p 7860:7860 fuxialexander/getdemo
39
- # or
40
- singularity run -w --bind /manitou/pmg/users/xf2217/getdemo:/app --bind /manitou/pmg/users/xf2217/demo_data:/data --bind /pmglocal/xf2217/tmp:/tmp --no-home --pwd /app getdemo
41
- ```
42
- The gradio interface will be available at http://127.0.0.1:7860, a sharable link will be printed in the terminal.
43
-
44
- # Build
45
- ```bash
46
- git clone --recursive [email protected]:fuxialexander/getdemo.git
47
- cd getdemo
48
- docker build -t getdemo .
49
- docker run -it -v "/path/to/data:/data" --rm -p 7860:7860 getdemo
50
- ```
 
7
  license: cc-by-nc-4.0
8
  pinned: false
9
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/main.py CHANGED
@@ -1,117 +1,59 @@
1
- import argparse
2
- import os
3
 
4
  import gradio as gr
5
  import matplotlib.pyplot as plt
6
  import pandas as pd
7
- import pkg_resources
8
- from dash_bio import Clustergram
9
- import sys
10
  import s3fs
11
- from glob import glob
12
- import numpy as np
13
-
14
- from atac_rna_data_processing.config.load_config import load_config
15
- from atac_rna_data_processing.io.celltype import GETCellType
16
- from atac_rna_data_processing.io.nr_motif_v1 import NrMotifV1
17
- from proscope.af2 import GETAFPairseg
18
- from proscope.data import get_genename_to_uniprot, get_lddt, get_seq
19
- from proscope.protein import Protein
20
- from proscope.viewer import view_pdb_html
21
-
22
-
23
- seq = get_seq()
24
- genename_to_uniprot = get_genename_to_uniprot()
25
- lddt = get_lddt()
26
-
27
- args = argparse.ArgumentParser()
28
- args.add_argument("-p", "--port", type=int, default=7860, help="Port number")
29
- args.add_argument("-s", "--share", action="store_true", help="Share on network")
30
- args.add_argument("-u", "--s3_uri", type=str, default=None, help="Path to demo S3 bucket")
31
- args.add_argument("-d", "--data", type=str, default=None, help="Data directory")
32
- args.add_argument("-n", "--host", type=str, default="127.0.0.1")
33
- args = args.parse_args()
34
-
35
- GET_CONFIG = load_config(
36
- "/app/modules/atac_rna_data_processing/atac_rna_data_processing/config/GET"
37
- )
38
- GET_CONFIG.celltype.jacob = True
39
- GET_CONFIG.celltype.num_cls = 2
40
- GET_CONFIG.celltype.input = True
41
- GET_CONFIG.celltype.embed = True
42
  plt.rcParams["figure.dpi"] = 100
43
 
44
- if args.s3_uri: # Use S3 path if exists
45
  s3_file_sys = s3fs.S3FileSystem(anon=True)
46
- GET_CONFIG.s3_file_sys = s3_file_sys
47
- GET_CONFIG.celltype.data_dir = (
48
- f"{args.s3_uri}/pretrain_human_bingren_shendure_apr2023/fetal_adult/"
49
  )
50
- GET_CONFIG.celltype.interpret_dir = (
51
- f"{args.s3_uri}/Interpretation_all_hg38_allembed_v4_natac/"
52
  )
53
- GET_CONFIG.motif_dir = f"{args.s3_uri}/interpret_natac/motif-clustering/"
54
- GET_CONFIG.assets_dir = f"{args.s3_uri}/assets/"
55
  cell_type_annot = pd.read_csv(
56
- GET_CONFIG.celltype.data_dir.split("fetal_adult")[0]
57
- + "data/cell_type_pretrain_human_bingren_shendure_apr2023.txt"
58
  )
59
  cell_type_id_to_name = dict(zip(cell_type_annot["id"], cell_type_annot["celltype"]))
60
  cell_type_name_to_id = dict(zip(cell_type_annot["celltype"], cell_type_annot["id"]))
61
  available_celltypes = sorted(
62
  [
63
  cell_type_id_to_name[f.split("/")[-1]]
64
- for f in s3_file_sys.glob(GET_CONFIG.celltype.interpret_dir + "*")
65
  ]
66
  )
67
- gene_pairs = s3_file_sys.glob(f"{args.s3_uri}/structures/causal/*")
68
- gene_pairs = [os.path.basename(pair) for pair in gene_pairs]
69
- motif = NrMotifV1.load_from_pickle(
70
- pkg_resources.resource_filename("atac_rna_data_processing", "data/NrMotifV1.pkl"),
71
- GET_CONFIG.motif_dir,
72
- )
73
- else: # Run with local data
74
- GET_CONFIG.s3_file_sys = None
75
- GET_CONFIG.celltype.data_dir = (
76
- f"{args.data}/pretrain_human_bingren_shendure_apr2023/fetal_adult/"
77
- )
78
- GET_CONFIG.celltype.interpret_dir = (
79
- f"{args.data}/Interpretation_all_hg38_allembed_v4_natac/"
80
- )
81
- GET_CONFIG.motif_dir = f"{args.data}/interpret_natac/motif-clustering/"
82
- GET_CONFIG.assets_dir = f"{args.data}/assets/"
83
- cell_type_annot = pd.read_csv(
84
- GET_CONFIG.celltype.data_dir.split("fetal_adult")[0]
85
- + "data/cell_type_pretrain_human_bingren_shendure_apr2023.txt"
86
- )
87
- cell_type_id_to_name = dict(zip(cell_type_annot["id"], cell_type_annot["celltype"]))
88
- cell_type_name_to_id = dict(zip(cell_type_annot["celltype"], cell_type_annot["id"]))
89
- available_celltypes = sorted(
90
- [
91
- cell_type_id_to_name[f.split("/")[-1]]
92
- for f in glob(GET_CONFIG.celltype.interpret_dir + "*")
93
- ]
94
- )
95
- gene_pairs = glob(f"{args.data}/structures/causal/*")
96
- gene_pairs = [os.path.basename(pair) for pair in gene_pairs]
97
- motif = NrMotifV1.load_from_pickle(
98
- pkg_resources.resource_filename("atac_rna_data_processing", "data/NrMotifV1.pkl"),
99
- GET_CONFIG.motif_dir,
100
- )
101
 
102
  def visualize_AF2(tf_pair, a):
103
- if args.s3_uri:
104
- strcture_dir = f"{args.s3_uri}/structures/causal/{tf_pair}"
105
- fasta_dir = f"{args.s3_uri}/sequences/causal/{tf_pair}"
106
- else:
107
- strcture_dir = f"{args.data}/structures/causal/{tf_pair}"
108
- fasta_dir = f"{args.data}/sequences/causal/{tf_pair}"
109
-
110
- if not os.path.exists(strcture_dir):
111
- gr.ErrorText("No such gene pair")
112
-
113
- a = GETAFPairseg(strcture_dir, fasta_dir, GET_CONFIG)
114
- # segpair.choices = list(a.pairs_data.keys())
115
  fig1 = a.plotly_plddt_gene1()
116
  fig2 = a.plotly_plddt_gene2()
117
  fig5, ax5 = a.plot_score_heatmap()
@@ -121,9 +63,12 @@ def visualize_AF2(tf_pair, a):
121
 
122
 
123
  def view_pdb(seg_pair, a):
 
 
 
124
  pdb_path = a.pairs_data[seg_pair].pdb
125
- if args.s3_uri:
126
- bucket_name = f"{args.s3_uri}".split("//")[1].split("/")[0]
127
  path_in_bucket = pdb_path.split("/", 1)[1]
128
  file_name = pdb_path.split("/")[-1]
129
  output_path = f"https://{bucket_name}.s3.amazonaws.com/{path_in_bucket}"
@@ -131,58 +76,62 @@ def view_pdb(seg_pair, a):
131
  ### Download PDB
132
  [{file_name}]({output_path})
133
  """
134
- else: # No download link if running locally
135
  output_text = ""
136
- return view_pdb_html(pdb_path, s3_file_sys=GET_CONFIG.s3_file_sys), a, output_text
137
 
138
 
139
  def update_dropdown(x, label):
140
- return gr.Dropdown.update(choices=x, label=label)
 
 
 
141
 
142
 
143
- def load_and_plot_celltype(celltype_name, GET_CONFIG, cell):
 
 
 
144
  celltype_id = cell_type_name_to_id[celltype_name]
145
- cell = GETCellType(celltype_id, GET_CONFIG)
146
  cell.celltype_name = celltype_name
147
  gene_exp_fig = cell.plotly_gene_exp()
148
  return gene_exp_fig, cell
149
 
150
 
151
- def plot_gene_regions(cell, gene_name, plotly=True):
 
 
 
152
  return cell.plot_gene_regions(gene_name, plotly=plotly), cell
153
 
154
 
155
- def plot_gene_motifs(cell, gene_name, motif, overwrite=False):
 
 
 
156
  return cell.plot_gene_motifs(gene_name, motif, overwrite=overwrite)[0], cell
157
 
158
 
159
- def plot_motif_subnet(cell, motif_collection, m, type="neighbors", threshold=0.1):
 
 
 
 
 
160
  return (
161
  cell.plotly_motif_subnet(motif_collection, m, type=type, threshold=threshold),
162
  cell,
163
  )
164
 
165
 
166
- def plot_gene_exp(cell, plotly=True):
 
 
 
167
  return cell.plotly_gene_exp(plotly=plotly), cell
168
 
169
 
170
- def plot_motif_corr(cell):
171
- fig = Clustergram(
172
- data=cell.gene_by_motif.corr.values,
173
- column_labels=list(cell.gene_by_motif.corr.columns.values),
174
- row_labels=list(cell.gene_by_motif.corr.index),
175
- hidden_labels=["row", "col"],
176
- # link_method="ward",
177
- display_ratio=0.1,
178
- width=600,
179
- height=350,
180
- color_map="rdbu_r",
181
- )
182
- fig["layout"].update(coloraxis_showscale=False)
183
- return fig, cell
184
-
185
-
186
  if __name__ == "__main__":
187
  with gr.Blocks(theme="sudeepshouche/minimalist") as demo:
188
  seg_pairs = gr.State([""])
@@ -190,25 +139,13 @@ if __name__ == "__main__":
190
  cell = gr.State(None)
191
 
192
  gr.Markdown(
193
- """# 🌟 GET: A Foundation Model of Transcription Across Human Cell Types 🌟
194
-
195
- Here we introduce GET, an innovative computational model aimed at understanding transcriptional regulation across 213 human fetal and adult cell types.
196
- Built solely on chromatin accessibility and sequence data, GET exhibits unparalleled generalizability and accuracy in predicting gene expression, even in previously unstudied cell types.
197
- The model adapts seamlessly across various sequencing platforms and assays, allowing inference of broad-spectrum regulatory activity.
198
- We validate GET's efficacy through its superior prediction of lentivirus-based massive parallel reporter assay outcomes and its ability to identify previously elusive distant regulatory regions in fetal erythroblasts.
199
- Moreover, our model reveals both universal and cell type-specific transcription factor interaction networks.
200
- Utilizing this comprehensive catalog, we elucidate the functional significance of a previously unidentified germline coding variant in PAX5, a lymphoma-associated transcription factor.
201
- Overall, GET serves as a robust, generalizable framework for understanding cell type-specific gene regulation and transcription factor interactions.
202
 
203
- Dive deep into our live demo and experience a revolution in cellular transcription like never before. Here's what you can explore:
 
204
 
205
- - 🔍 Prediction Performance: Choose your cell type and be amazed as we unveil a vivid plot comparing observed versus forecasted gene expression levels.
206
- - 🧬 Cell-type Specific Regulatory Insights: Just pick a gene, and voilà! Revel in intricate plots revealing the cell-type specific regulatory landscapes and motifs.
207
- - 🔗 Motif Correlation & Causal Subnetworks: Engage with our intuitive heatmap to witness motif correlations. Go further - choose a motif, define your subnetwork preference, set an effect size threshold, and behold the magic unfold!
208
- - 🔬 Structural Atlas of Interactions: Step into the realm of transcription factor pairs. Experience heatmaps, pLDDT metrics, and more. And guess what? You can even download the PDB file for select segment pairs!
209
-
210
- Stay tuned! We're set to dazzle you further as we launch our demo on Huggingface this week. Questions, thoughts, or moments of awe? Don't hesitate to reach out!
211
-
212
  """
213
  )
214
 
@@ -219,22 +156,33 @@ Stay tuned! We're set to dazzle you further as we launch our demo on Huggingface
219
  """
220
  ## 🔍 Prediction performance
221
 
222
- This section enables you to select different cell types and generates a plot that compares observed gene expression levels to predicted ones. It's important to note that for cell types without available observed gene expression data, the plot will display a vertical line at 0, indicating the absence of empirical expression data for those particular cell types. This visualization helps assess the accuracy of gene expression predictions in the context of different cell types.
 
 
 
 
223
  """
224
  )
225
  celltype_name = gr.Dropdown(
226
- label="Cell Type", choices=available_celltypes, value='Fetal Astrocyte 1'
 
 
227
  )
228
  celltype_btn = gr.Button(value="Load & plot gene expression")
229
- gene_exp_plot = gr.Plot(label="Gene expression prediction vs observation")
 
 
230
 
231
  # Right column: Plot gene motifs
232
  with gr.Column():
233
  gr.Markdown(
234
  """
235
- ### 🧬 Cell-type specific regulatory inference
236
 
237
- In this section, you can choose a specific gene and access visualizations of its cell-type specific regulatory regions and motifs that promote gene expression. When you hover over the highlighted regions (the top 10%), you'll be able to view information about the motifs present in those regions and their corresponding scores. This feature allows for a detailed exploration of the regulatory elements influencing the expression of the selected gene.
 
 
 
238
  """
239
  )
240
  gene_name_for_region = gr.Textbox(
@@ -249,54 +197,42 @@ In this section, you can choose a specific gene and access visualizations of its
249
 
250
  gr.Markdown(
251
  """
252
- ## 🔗 Motif correlation and causal subnetworks
253
-
254
- Motif correlation, as it relates to a cell-type specific gene-by-motif matrix, signifies the examination of associations between specific DNA sequence motifs and the expression patterns of genes in a particular cell type. This analysis is grounded in the concept that a correlation between a motif and gene expression implies co-regulation of downstream target genes, suggesting functional interactions between the regulatory motif and the genes it influences.
255
-
256
- In simpler terms, when you observe a motif having a strong positive correlation with the expression of certain genes in a specific cell type, it suggests that this motif is associated with the coordinated regulation of those genes. This correlation indicates that the motif likely plays a role in controlling the activity of those genes, possibly by acting as a binding site for transcription factors or other regulatory proteins. Conversely, a negative correlation might suggest that the motif is associated with the repression of those genes.
257
-
258
- Overall, motif correlation analysis helps uncover potential regulatory relationships within a cell type by identifying motifs that are statistically linked to the expression patterns of genes. This can provide valuable insights into the functional interactions and regulatory mechanisms at play in that specific biological context.
259
  """
260
  )
261
- with gr.Row() as row:
262
- with gr.Column():
263
- clustergram_btn = gr.Button(value="Plot motif correlation heatmap")
264
- clustergram_plot = gr.Plot(label="Motif correlation")
265
 
266
- # Right column: Motif subnet plot
267
- with gr.Column():
268
- with gr.Row() as row:
269
- motif_for_subnet = gr.Dropdown(
270
- label="Motif causal subnetwork", choices=motif.cluster_names, value='KLF/SP/2'
271
- )
272
- subnet_type = gr.Dropdown(
273
- label="Interaction type",
274
- choices=["neighbors", "parents", "children"],
275
- value="neighbors",
276
- )
277
- # slider for threshold 0.01-0.2
278
- subnet_threshold = gr.Slider(
279
- label="Threshold",
280
- minimum=0.01,
281
- maximum=0.25,
282
- step=0.01,
283
- value=0.1,
284
- )
285
- subnet_btn = gr.Button(value="Plot Motif Causal Subnetwork")
286
- subnet_plot = gr.Plot(label="Motif Causal Subnetwork")
287
 
288
  gr.Markdown(
289
  """
290
  ## 🔬 Structural atlas of TF-TF and TF-EP300 interactions
291
 
292
  This section allows you to explore transcription factor pairs within a causal network. You can visualize metrics like Heatmaps and pLDDT (predicted Local Distance Difference Test) for both proteins in the pair.
293
-
294
  The first row displays the pLDDT segmentation plot for the two TFs, helping to identify protein disorder regions. Each TF is divided into disordered and ordered segments labeled numerically as ZFX_0, ZFX_1, etc., with disordered segments marked in red. Uniprot annotations are included if available.
295
-
296
  The second row shows the interaction pLDDT plot. It compares pLDDT scores between segment pairs from AlphaFold2 predictions, indicating regions stabilized by TF interactions.
297
-
298
  The third row presents a heatmap plot, including:
299
-
300
  - *Interchain min pAE*: lower scores indicate stronger protein-protein interactions.
301
  - *Mean pLDDT*: higher scores signify greater prediction confidence or (inverse-)disorderness.
302
  - *ipTM*: higher scores reflect better predicted interaction quality by AlphaFold2.
@@ -306,13 +242,12 @@ You can download specific segment pair PDB files by clicking 'Get PDB.'
306
  """
307
  )
308
 
309
-
310
  with gr.Row() as row:
311
  with gr.Column():
312
  tf_pairs = gr.Dropdown(label="TF pair", choices=gene_pairs)
313
  tf_pairs_btn = gr.Button(value="Load & Plot")
314
  heatmap = gr.Plot(label="Heatmap")
315
-
316
  with gr.Column():
317
  segpair = gr.Dropdown(label="Seg pair")
318
  segpair_btn = gr.Button(value="Get PDB")
@@ -321,8 +256,10 @@ You can download specific segment pair PDB files by clicking 'Get PDB.'
321
 
322
  with gr.Row() as row:
323
  interact_plddt1 = gr.Plot(label="Interact pLDDT 1")
 
 
324
  interact_plddt2 = gr.Plot(label="Interact pLDDT 2")
325
-
326
  tf_pairs_btn.click(
327
  visualize_AF2,
328
  inputs=[tf_pairs, af],
@@ -339,7 +276,7 @@ You can download specific segment pair PDB files by clicking 'Get PDB.'
339
  )
340
  celltype_btn.click(
341
  load_and_plot_celltype,
342
- inputs=[celltype_name, gr.State(GET_CONFIG), cell],
343
  outputs=[gene_exp_plot, cell],
344
  )
345
  region_plot_btn.click(
@@ -352,9 +289,7 @@ You can download specific segment pair PDB files by clicking 'Get PDB.'
352
  inputs=[cell, gene_name_for_region, gr.State(motif)],
353
  outputs=[motif_plot, cell],
354
  )
355
- clustergram_btn.click(
356
- plot_motif_corr, inputs=[cell], outputs=[clustergram_plot, cell]
357
- )
358
  subnet_btn.click(
359
  plot_motif_subnet,
360
  inputs=[
@@ -367,4 +302,4 @@ You can download specific segment pair PDB files by clicking 'Get PDB.'
367
  outputs=[subnet_plot, cell],
368
  )
369
 
370
- demo.launch(server_name=args.host, share=args.share, server_port=args.port)
 
1
+ # Demo app
2
+ from pathlib import Path
3
 
4
  import gradio as gr
5
  import matplotlib.pyplot as plt
6
  import pandas as pd
 
 
 
7
  import s3fs
8
+ from genomespy import GenomeSpy
9
+
10
+ from gcell.cell.celltype import GETCellType
11
+ from gcell.config.config import load_config
12
+ from gcell.dna.nr_motif_v1 import NrMotifV1
13
+ from gcell.protein.af2 import AFPairseg
14
+ from gcell.utils.pdb_viewer import view_pdb_html
15
+
16
+ gs = GenomeSpy()
17
+
18
+ cfg = load_config("s3_interpret")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  plt.rcParams["figure.dpi"] = 100
20
 
21
+ if cfg.s3_uri: # Use S3 path if exists
22
  s3_file_sys = s3fs.S3FileSystem(anon=True)
23
+ cfg.celltype.data_dir = (
24
+ f"{cfg.s3_uri}/pretrain_human_bingren_shendure_apr2023/fetal_adult/"
 
25
  )
26
+ cfg.celltype.interpret_dir = (
27
+ f"{cfg.s3_uri}/Interpretation_all_hg38_allembed_v4_natac/"
28
  )
29
+ cfg.celltype.motif_dir = f"{cfg.s3_uri}/interpret_natac/motif-clustering/"
30
+ cfg.celltype.assets_dir = f"{cfg.s3_uri}/assets/"
31
  cell_type_annot = pd.read_csv(
32
+ cfg.celltype.data_dir.split("fetal_adult")[0]
33
+ + "data/cell_type_pretrain_human_bingren_shendure_apr2023.txt"
34
  )
35
  cell_type_id_to_name = dict(zip(cell_type_annot["id"], cell_type_annot["celltype"]))
36
  cell_type_name_to_id = dict(zip(cell_type_annot["celltype"], cell_type_annot["id"]))
37
  available_celltypes = sorted(
38
  [
39
  cell_type_id_to_name[f.split("/")[-1]]
40
+ for f in s3_file_sys.glob(cfg.celltype.interpret_dir + "*")
41
  ]
42
  )
43
+ gene_pairs = s3_file_sys.glob(f"{cfg.s3_uri}/structures/causal/*")
44
+ gene_pairs = [Path(pair).name for pair in gene_pairs]
45
+ motif = NrMotifV1.load_from_pickle()
46
+ else:
47
+ raise ValueError("S3 URI is required")
48
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  def visualize_AF2(tf_pair, a):
51
+ """
52
+ Visualize the AlphaFold2 structure of a transcription factor pair.
53
+ """
54
+ strcture_dir = f"{cfg.s3_uri}/structures/causal/{tf_pair}"
55
+ fasta_dir = f"{cfg.s3_uri}/sequences/causal/{tf_pair}"
56
+ a = AFPairseg(strcture_dir, fasta_dir, s3_file_sys=s3_file_sys)
 
 
 
 
 
 
57
  fig1 = a.plotly_plddt_gene1()
58
  fig2 = a.plotly_plddt_gene2()
59
  fig5, ax5 = a.plot_score_heatmap()
 
63
 
64
 
65
  def view_pdb(seg_pair, a):
66
+ """
67
+ View the PDB file of a transcription factor pair.
68
+ """
69
  pdb_path = a.pairs_data[seg_pair].pdb
70
+ if cfg.s3_uri:
71
+ bucket_name = f"{cfg.s3_uri}".split("//")[1].split("/")[0]
72
  path_in_bucket = pdb_path.split("/", 1)[1]
73
  file_name = pdb_path.split("/")[-1]
74
  output_path = f"https://{bucket_name}.s3.amazonaws.com/{path_in_bucket}"
 
76
  ### Download PDB
77
  [{file_name}]({output_path})
78
  """
79
+ else: # No download link if running locally
80
  output_text = ""
81
+ return view_pdb_html(pdb_path, s3_file_sys=s3_file_sys), a, output_text
82
 
83
 
84
  def update_dropdown(x, label):
85
+ """
86
+ Update the dropdown menu.
87
+ """
88
+ return gr.Dropdown(choices=x, label=label, interactive=True)
89
 
90
 
91
+ def load_and_plot_celltype(celltype_name, GET_CONFIG, cell, s3_file_sys=s3_file_sys):
92
+ """
93
+ Load and plot the gene expression of a cell type.
94
+ """
95
  celltype_id = cell_type_name_to_id[celltype_name]
96
+ cell = GETCellType(celltype_id, GET_CONFIG, s3_file_sys=s3_file_sys)
97
  cell.celltype_name = celltype_name
98
  gene_exp_fig = cell.plotly_gene_exp()
99
  return gene_exp_fig, cell
100
 
101
 
102
+ def plot_gene_regions(cell, gene_name, plotly: bool = True):
103
+ """
104
+ Plot the important regions of a gene.
105
+ """
106
  return cell.plot_gene_regions(gene_name, plotly=plotly), cell
107
 
108
 
109
+ def plot_gene_motifs(cell, gene_name, motif, overwrite: bool = False):
110
+ """
111
+ Plot the gene motifs of a gene.
112
+ """
113
  return cell.plot_gene_motifs(gene_name, motif, overwrite=overwrite)[0], cell
114
 
115
 
116
+ def plot_motif_subnet(
117
+ cell, motif_collection, m, type: str = "neighbors", threshold: float = 0.1
118
+ ):
119
+ """
120
+ Plot the motif subnet of a motif.
121
+ """
122
  return (
123
  cell.plotly_motif_subnet(motif_collection, m, type=type, threshold=threshold),
124
  cell,
125
  )
126
 
127
 
128
+ def plot_gene_exp(cell, plotly: bool = True):
129
+ """
130
+ Plot the gene expression of a cell type.
131
+ """
132
  return cell.plotly_gene_exp(plotly=plotly), cell
133
 
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  if __name__ == "__main__":
136
  with gr.Blocks(theme="sudeepshouche/minimalist") as demo:
137
  seg_pairs = gr.State([""])
 
139
  cell = gr.State(None)
140
 
141
  gr.Markdown(
142
+ """# A Foundation Model of Transcription Across Human Cell Types
143
+ This is a demo of the results of the GET model.
 
 
 
 
 
 
 
144
 
145
+ Check out our [paper](https://www.nature.com/articles/s41586-024-08391-z), [model package](https://github.com/GET-Foundation/get_model)
146
+ and [analysis package](https://github.com/GET-Foundation/gcell) for more details.
147
 
148
+ Pretrained models, training data, inferred structures and regulatory information are hosted on a public [S3 bucket](s3://2023-get-xf2217/get_demo)
 
 
 
 
 
 
149
  """
150
  )
151
 
 
156
  """
157
  ## 🔍 Prediction performance
158
 
159
+ This section enables you to select different cell types and generates a plot that compares observed
160
+ gene expression levels to predicted ones. It's important to note that for cell types without available
161
+ observed gene expression data, the plot will display a vertical line at 0, indicating the absence of
162
+ empirical expression data for those particular cell types. This visualization helps assess the accuracy
163
+ of gene expression predictions in the context of different cell types.
164
  """
165
  )
166
  celltype_name = gr.Dropdown(
167
+ label="Cell Type",
168
+ choices=available_celltypes,
169
+ value="Fetal Astrocyte 1",
170
  )
171
  celltype_btn = gr.Button(value="Load & plot gene expression")
172
+ gene_exp_plot = gr.Plot(
173
+ label="Gene expression prediction vs observation"
174
+ )
175
 
176
  # Right column: Plot gene motifs
177
  with gr.Column():
178
  gr.Markdown(
179
  """
180
+ ## 🧬 Cell-type specific regulatory inference
181
 
182
+ In this section, you can choose a specific gene and access visualizations of its cell-type specific regulatory
183
+ regions and motifs that promote gene expression. When you hover over the highlighted regions (the top 10%),
184
+ you'll be able to view information about the motifs present in those regions and their corresponding scores.
185
+ This feature allows for a detailed exploration of the regulatory elements influencing the expression of the selected gene.
186
  """
187
  )
188
  gene_name_for_region = gr.Textbox(
 
197
 
198
  gr.Markdown(
199
  """
200
+ ## 🔗 Causal discovery on motif-motif interactions
201
+ This section allows you to explore the inferred (using [LiNGAM](https://jmlr.org/papers/volume7/shimizu06a/shimizu06a.pdf))
202
+ relationships between motifs in the selected cell type.
 
 
 
 
203
  """
204
  )
 
 
 
 
205
 
206
+ with gr.Row() as row:
207
+ motif_for_subnet = gr.Dropdown(
208
+ label="Motif causal subnetwork",
209
+ choices=motif.cluster_names,
210
+ value="KLF/SP/2",
211
+ )
212
+ subnet_type = gr.Dropdown(
213
+ label="Interaction type",
214
+ choices=["neighbors", "parents", "children"],
215
+ value="neighbors",
216
+ )
217
+ # slider for threshold 0.01-0.25
218
+ subnet_threshold = gr.Slider(
219
+ label="Threshold",
220
+ minimum=0.01,
221
+ maximum=0.25,
222
+ step=0.01,
223
+ value=0.1,
224
+ )
225
+ subnet_btn = gr.Button(value="Plot Motif Causal Subnetwork")
226
+ subnet_plot = gr.Plot(label="Motif Causal Subnetwork")
227
 
228
  gr.Markdown(
229
  """
230
  ## 🔬 Structural atlas of TF-TF and TF-EP300 interactions
231
 
232
  This section allows you to explore transcription factor pairs within a causal network. You can visualize metrics like Heatmaps and pLDDT (predicted Local Distance Difference Test) for both proteins in the pair.
 
233
  The first row displays the pLDDT segmentation plot for the two TFs, helping to identify protein disorder regions. Each TF is divided into disordered and ordered segments labeled numerically as ZFX_0, ZFX_1, etc., with disordered segments marked in red. Uniprot annotations are included if available.
 
234
  The second row shows the interaction pLDDT plot. It compares pLDDT scores between segment pairs from AlphaFold2 predictions, indicating regions stabilized by TF interactions.
 
235
  The third row presents a heatmap plot, including:
 
236
  - *Interchain min pAE*: lower scores indicate stronger protein-protein interactions.
237
  - *Mean pLDDT*: higher scores signify greater prediction confidence or (inverse-)disorderness.
238
  - *ipTM*: higher scores reflect better predicted interaction quality by AlphaFold2.
 
242
  """
243
  )
244
 
 
245
  with gr.Row() as row:
246
  with gr.Column():
247
  tf_pairs = gr.Dropdown(label="TF pair", choices=gene_pairs)
248
  tf_pairs_btn = gr.Button(value="Load & Plot")
249
  heatmap = gr.Plot(label="Heatmap")
250
+
251
  with gr.Column():
252
  segpair = gr.Dropdown(label="Seg pair")
253
  segpair_btn = gr.Button(value="Get PDB")
 
256
 
257
  with gr.Row() as row:
258
  interact_plddt1 = gr.Plot(label="Interact pLDDT 1")
259
+
260
+ with gr.Row() as row:
261
  interact_plddt2 = gr.Plot(label="Interact pLDDT 2")
262
+
263
  tf_pairs_btn.click(
264
  visualize_AF2,
265
  inputs=[tf_pairs, af],
 
276
  )
277
  celltype_btn.click(
278
  load_and_plot_celltype,
279
+ inputs=[celltype_name, gr.State(cfg), cell],
280
  outputs=[gene_exp_plot, cell],
281
  )
282
  region_plot_btn.click(
 
289
  inputs=[cell, gene_name_for_region, gr.State(motif)],
290
  outputs=[motif_plot, cell],
291
  )
292
+
 
 
293
  subnet_btn.click(
294
  plot_motif_subnet,
295
  inputs=[
 
302
  outputs=[subnet_plot, cell],
303
  )
304
 
305
+ demo.launch(server_name=cfg.host, share=cfg.share, server_port=cfg.port)
modules/atac_rna_data_processing DELETED
@@ -1 +0,0 @@
1
- Subproject commit ef20c33c5fc3e2e1d4bda694d01ee88ff53dd38c
 
 
modules/proscope DELETED
@@ -1 +0,0 @@
1
- Subproject commit 17ad0359acb89c13fd1fc8cd0149c505d21f78d3