Skip to content

Commit a6fc994

Browse files
hevans66 and acashmoney authored
[LAB-470] 442 tool author attribution to iojson (#599)
Co-authored-by: Aakaash Meduri <aakaash.meduri@gmail.com>
1 parent 4604cce commit a6fc994

20 files changed

+180
-154
lines changed
+40-36
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,43 @@
11
{
2-
"class": "CommandLineTool",
3-
"name": "equibind",
4-
"description": "Docking of small molecules to a protein",
5-
"baseCommand": ["/bin/bash", "-c"],
6-
"arguments": [
7-
"python main.py --protein $(inputs.protein.filepath) --small_molecule_library $(inputs.small_molecule.filepath);",
8-
"mv /outputs/ligands_predicted.sdf /outputs/$(inputs.protein.basename)_$(inputs.small_molecule.basename)_docked.$(inputs.small_molecule.ext);",
9-
"cp $(inputs.protein.filepath) /outputs/;",
10-
"rmdir /outputs/dummy;"
11-
],
12-
"dockerPull": "ghcr.io/labdao/equibind@sha256:ae2cec63b3924774727ed1c6c8af95cf4aaea2d3f0c5acbec56478505ccb2b07",
13-
"gpuBool": false,
14-
"networkBool": false,
15-
"inputs": {
16-
"protein": {
17-
"type": "File",
18-
"item": "",
19-
"glob": ["*.pdb"]
20-
},
21-
"small_molecule": {
22-
"type": "File",
23-
"item": "",
24-
"glob": ["*.sdf", "*.mol2"]
25-
}
2+
"class": "CommandLineTool",
3+
"name": "equibind",
4+
"description": "Docking of small molecules to a protein",
5+
"author": "@misc{stärk2022equibind,\n title={EquiBind: Geometric Deep Learning for Drug Binding Structure Prediction}, \n author={Hannes Stärk and Octavian-Eugen Ganea and Lagnajit Pattanaik and Regina Barzilay and Tommi Jaakkola},\n year={2022},\n eprint={2202.05146},\n archivePrefix={arXiv},\n primaryClass={q-bio.BM}\n}",
6+
"baseCommand": ["/bin/bash", "-c"],
7+
"arguments": [
8+
"mkdir -p /tmp-inputs/tmp;",
9+
"mkdir -p /tmp-outputs/tmp;",
10+
"cp /inputs/* /tmp-inputs/tmp/;",
11+
"ls /tmp-inputs/tmp;",
12+
"cd /src && python /src/inference.py --config=/src/configs_clean/bacalhau.yml;",
13+
"mv /tmp-outputs/tmp/* /outputs/;",
14+
"mv /outputs/lig_equibind_corrected.sdf /outputs/$(inputs.protein.basename)_$(inputs.small_molecule.basename)_docked.$(inputs.small_molecule.ext);",
15+
"mv /tmp-inputs/tmp/*.pdb /outputs/;"],
16+
"dockerPull": "ghcr.io/labdao/equibind:main@sha256:21a381d9ab1ff047565685044569c8536a55e489c9531326498b28d6b3cc244f",
17+
"gpuBool": false,
18+
"networkBool": false,
19+
"inputs": {
20+
"protein": {
21+
"type": "File",
22+
"item": "",
23+
"glob": ["*.pdb"]
2624
},
27-
"outputs": {
28-
"best_docked_small_molecule": {
29-
"type": "File",
30-
"item": "",
31-
"glob": ["*_docked.sdf"]
32-
},
33-
"protein": {
34-
"type": "File",
35-
"item": "",
36-
"glob": ["*.pdb"]
37-
}
25+
"small_molecule": {
26+
"type": "File",
27+
"item": "",
28+
"glob": ["*.sdf", "*.mol2"]
3829
}
39-
}
30+
},
31+
"outputs": {
32+
"best_docked_small_molecule": {
33+
"type": "File",
34+
"item": "",
35+
"glob": ["*_docked.sdf", "*_docked.mol2"]
36+
},
37+
"protein": {
38+
"type": "File",
39+
"item": "",
40+
"glob": ["*.pdb"]
41+
}
42+
}
43+
}

internal/ipwl/tool.go

+1
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ type ToolOutput struct {
2828
type Tool struct {
2929
Name string `json:"name"`
3030
Description string `json:"description"`
31+
Author string `json:"author"`
3132
BaseCommand []string `json:"baseCommand"`
3233
Arguments []string `json:"arguments"`
3334
DockerPull string `json:"dockerPull"`

internal/ipwl/tool_test.go

+11-6
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,20 @@ func TestReadToolConfig(t *testing.T) {
99
filePath := "testdata/example_tool.json"
1010
expected := Tool{
1111
Name: "equibind",
12+
Author: "@misc{stärk2022equibind,\n title={EquiBind: Geometric Deep Learning for Drug Binding Structure Prediction}, \n author={Hannes Stärk and Octavian-Eugen Ganea and Lagnajit Pattanaik and Regina Barzilay and Tommi Jaakkola},\n year={2022},\n eprint={2202.05146},\n archivePrefix={arXiv},\n primaryClass={q-bio.BM}\n}",
1213
Description: "Docking of small molecules to a protein",
1314
BaseCommand: []string{"/bin/bash", "-c"},
1415
Arguments: []string{
15-
"python main.py --protein $(inputs.protein.filepath) --small_molecule_library $(inputs.small_molecule.filepath);",
16-
"mv /outputs/ligands_predicted.sdf /outputs/$(inputs.protein.basename)_$(inputs.small_molecule.basename)_docked.$(inputs.small_molecule.ext);",
17-
"cp $(inputs.protein.filepath) /outputs/;",
18-
"rmdir /outputs/dummy;",
16+
"mkdir -p /tmp-inputs/tmp;",
17+
"mkdir -p /tmp-outputs/tmp;",
18+
"cp /inputs/* /tmp-inputs/tmp/;",
19+
"ls /tmp-inputs/tmp;",
20+
"cd /src && python /src/inference.py --config=/src/configs_clean/bacalhau.yml;",
21+
"mv /tmp-outputs/tmp/* /outputs/;",
22+
"mv /outputs/lig_equibind_corrected.sdf /outputs/$(inputs.protein.basename)_$(inputs.small_molecule.basename)_docked.$(inputs.small_molecule.ext);",
23+
"mv /tmp-inputs/tmp/*.pdb /outputs/;",
1924
},
20-
DockerPull: "ghcr.io/labdao/equibind@sha256:ae2cec63b3924774727ed1c6c8af95cf4aaea2d3f0c5acbec56478505ccb2b07",
25+
DockerPull: "ghcr.io/labdao/equibind:main@sha256:21a381d9ab1ff047565685044569c8536a55e489c9531326498b28d6b3cc244f",
2126
GpuBool: false,
2227
Inputs: map[string]ToolInput{
2328
"protein": {
@@ -32,7 +37,7 @@ func TestReadToolConfig(t *testing.T) {
3237
Outputs: map[string]ToolOutput{
3338
"best_docked_small_molecule": {
3439
Type: "File",
35-
Glob: []string{"*_docked.sdf"},
40+
Glob: []string{"*_docked.sdf", "*_docked.mol2"},
3641
},
3742
"protein": {
3843
Type: "File",

python/src/plex/__init__.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,14 @@ class ScatteringMethod(Enum):
1313

1414

1515
class CoreTools(Enum):
16-
EQUIBIND = "QmZ2HarAgwZGjc3LBx9mWNwAQkPWiHMignqKup1ckp8NhB"
17-
DIFFDOCK = "QmSzetFkveiQYZ5FgpZdHHfsjMWYz5YzwMAvqUgUFhFPMM"
16+
EQUIBIND = "QmZWYpZXsrbtzvBCHngh4YEgME5djnV5EedyTpc8DrK7k2"
17+
DIFFDOCK = "QmfKhJh48aDHgckzwGEASNmZd1SYstQiR5qLqqYmLQFzq9"
1818
COLABFOLD_MINI = "QmcRH74qfqDBJFku3mEDGxkAf6CSpaHTpdbe1pMkHnbcZD"
1919
COLABFOLD_STANDARD = "QmXnM1VpdGgX5huyU3zTjJovsu42KPfWhjxhZGkyvy9PVk"
2020
COLABFOLD_LARGE = "QmPYqMy19VFFuYztL6b5ruo4Kw4JWT583emStGrSYTH5Yi"
2121
BAM2FASTQ = "QmbPUirWiWCv9sgdHLekf5AnoCdw4QPU2SyfGGKs9JRRbq"
2222
ODDT = "QmUx7NdxkXXZvbK1JXZVUYUBqsevWkbVxgTzpWJ4Xp4inf"
23-
RFDIFFUSION = "QmXnCBCtoYuPyGsEJVpjn5regHfFSYa8kx44e22XxDX2t2"
23+
RFDIFFUSION = "QmTyFGjt2oqTLGQRE5u8mtfiQNft5nzMsieYdvwnpfk3HJ"
2424
REPEATMODELER = "QmZdXxnUt1sFFR39CfkEUgiioUBf6qP5CUs8TCb7Wqn4MC"
2525
GNINA = "QmZiQWEXj3aMRnJLoU39HHcknMDfKQD2txpfk6ubJAdDRx"
2626
BATCH_DLKCAT = "QmQTjvP2utNb1JTtUHeQ8mQPvNkCTg5VRc4LVdptWkUcJ7"

tools/bam2fastq.json

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
"class": "CommandLineTool",
33
"name": "bam2fastq",
44
"description": "Sort BAM by qname and Extract Fasta reads R1 R2 with RG using samtools",
5+
"author": "",
56
"inputs": {
67
"genome": {
78
"type": "File",

tools/blender/blender.json

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
"class": "CommandLineTool",
33
"name": "blender",
44
"description": "let's create some fancy protein graphics",
5+
"author": "",
56
"baseCommand": ["/bin/bash", "-c"],
67
"arguments": [
78
"blender --background --python app.py -- $(inputs.protein.filepath) /outputs/protein.png"

tools/colabfold-large.json

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
"class": "CommandLineTool",
33
"name": "colabfold-large",
44
"description": "Protein folding prediction using Colabfold (large settings)",
5+
"author": "",
56
"baseCommand": ["/bin/bash", "-c"],
67
"arguments": [
78
"colabfold_batch --templates --num-recycle $(inputs.recycle.default) --use-gpu-relax --amber /inputs /outputs;"

tools/colabfold-mini.json

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
"class": "CommandLineTool",
33
"name": "colabfold-mini",
44
"description": "Protein folding prediction using Colabfold (mini settings)",
5+
"author": "",
56
"baseCommand": ["/bin/bash", "-c"],
67
"arguments": [
78
"colabfold_batch --templates --max-msa 32:64 --num-recycle $(inputs.recycle.default) /inputs /outputs;"

tools/colabfold-standard.json

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
"class": "CommandLineTool",
33
"name": "colabfold-standard",
44
"description": "Protein folding prediction using Colabfold (standard settings)",
5+
"author": "",
56
"baseCommand": ["/bin/bash", "-c"],
67
"arguments": [
78
"colabfold_batch --templates --num-recycle $(inputs.recycle.default) /inputs /outputs;"

tools/diffdock.json

+61-60
Original file line numberDiff line numberDiff line change
@@ -1,63 +1,64 @@
11
{
2-
"class": "CommandLineTool",
3-
"name": "diffdock",
4-
"description": "Docking of small molecules to a protein",
5-
"baseCommand": ["/bin/bash", "-c"],
6-
"arguments": [
7-
"python datasets/esm_embedding_preparation.py --protein_path $(inputs.protein.filepath) --out_file /outputs/prepared_for_esm.fasta;",
8-
"HOME=esm/model_weights python esm/scripts/extract.py esm2_t33_650M_UR50D /outputs/prepared_for_esm.fasta /outputs/esm2_output --repr_layers $(inputs.repr_layers.default) --include per_tok && cp -r /outputs/esm2_output data/esm2_output;",
9-
"python -m inference --protein_path $(inputs.protein.filepath) --ligand $(inputs.small_molecule.filepath) --out_dir /outputs --inference_steps $(inputs.inference_steps.default) --samples_per_complex $(inputs.samples_per_complex.default) --batch_size $(inputs.batch_size.default) --actual_steps $(inputs.actual_steps.default) --no_final_step_noise;",
10-
"cp $(inputs.protein.filepath) /outputs"
11-
],
12-
"dockerPull": "ghcr.io/labdao/diffdock:main@sha256:b00432de73478d3da578e4a16ee669178828109f3c7bf9c58d44bb7514f68629",
13-
"gpuBool": true,
14-
"networkBool": true,
15-
"memoryGB": 12,
16-
"inputs": {
17-
"protein": {
18-
"type": "File",
19-
"glob": ["*.pdb"]
20-
},
21-
"small_molecule": {
22-
"type": "File",
23-
"glob": ["*.sdf", "*.mol2"]
24-
},
25-
"repr_layers": {
26-
"type": "int",
27-
"default": "33"
28-
},
29-
"inference_steps": {
30-
"type": "int",
31-
"default": "20"
32-
},
33-
"samples_per_complex": {
34-
"type": "int",
35-
"default": "40"
36-
},
37-
"batch_size": {
38-
"type": "int",
39-
"default": "10"
40-
},
41-
"actual_steps": {
42-
"type": "int",
43-
"default": "18"
44-
}
45-
},
46-
"outputs": {
47-
"best_docked_small_molecule": {
48-
"type": "File",
49-
"item": "",
50-
"glob": ["index*/rank1.sdf"]
51-
},
52-
"all_docked_small_molecules": {
53-
"type": "Array",
54-
"item": "File",
55-
"glob": ["index*/rank*.sdf"]
56-
},
57-
"protein": {
58-
"type": "File",
59-
"item": "",
60-
"glob": ["*.pdb"]
61-
}
2+
"class": "CommandLineTool",
3+
"name": "diffdock",
4+
"description": "Docking of small molecules to a protein",
5+
"author": "@misc{corso2023diffdock,\n title={DiffDock: Diffusion Steps, Twists, and Turns for Molecular Docking},\n author={Gabriele Corso and Hannes Stärk and Bowen Jing and Regina Barzilay and Tommi Jaakkola},\n year={2023},\n eprint={2210.01776},\n archivePrefix={arXiv},\n primaryClass={q-bio.BM}\n}",
6+
"baseCommand": ["/bin/bash", "-c"],
7+
"arguments": [
8+
"python datasets/esm_embedding_preparation.py --protein_path $(inputs.protein.filepath) --out_file /outputs/prepared_for_esm.fasta;",
9+
"HOME=esm/model_weights python esm/scripts/extract.py esm2_t33_650M_UR50D /outputs/prepared_for_esm.fasta /outputs/esm2_output --repr_layers $(inputs.repr_layers.default) --include per_tok && cp -r /outputs/esm2_output data/esm2_output;",
10+
"python -m inference --protein_path $(inputs.protein.filepath) --ligand $(inputs.small_molecule.filepath) --out_dir /outputs --inference_steps $(inputs.inference_steps.default) --samples_per_complex $(inputs.samples_per_complex.default) --batch_size $(inputs.batch_size.default) --actual_steps $(inputs.actual_steps.default) --no_final_step_noise;",
11+
"cp $(inputs.protein.filepath) /outputs"
12+
],
13+
"dockerPull": "ghcr.io/labdao/diffdock:main@sha256:b00432de73478d3da578e4a16ee669178828109f3c7bf9c58d44bb7514f68629",
14+
"gpuBool": true,
15+
"networkBool": true,
16+
"memoryGB": 12,
17+
"inputs": {
18+
"protein": {
19+
"type": "File",
20+
"glob": ["*.pdb"]
21+
},
22+
"small_molecule": {
23+
"type": "File",
24+
"glob": ["*.sdf", "*.mol2"]
25+
},
26+
"repr_layers": {
27+
"type": "int",
28+
"default": "33"
29+
},
30+
"inference_steps": {
31+
"type": "int",
32+
"default": "20"
33+
},
34+
"samples_per_complex": {
35+
"type": "int",
36+
"default": "40"
37+
},
38+
"batch_size": {
39+
"type": "int",
40+
"default": "10"
41+
},
42+
"actual_steps": {
43+
"type": "int",
44+
"default": "18"
45+
}
46+
},
47+
"outputs": {
48+
"best_docked_small_molecule": {
49+
"type": "File",
50+
"item": "",
51+
"glob": ["index*/rank1.sdf"]
52+
},
53+
"all_docked_small_molecules": {
54+
"type": "Array",
55+
"item": "File",
56+
"glob": ["index*/rank*.sdf"]
57+
},
58+
"protein": {
59+
"type": "File",
60+
"item": "",
61+
"glob": ["*.pdb"]
6262
}
63+
}
6364
}

tools/dlkcat/batch_dlkcat.json

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
"class": "CommandLineTool",
33
"name": "dlkcat",
44
"description": "batch predict enzyme catalytic activity from a protein sequence and molecule smile",
5+
"author": "",
56
"baseCommand": ["/bin/bash", "-c"],
67
"arguments": [
78
"conda run -n env python prediction_for_input.py $(inputs.input_tsv.filepath) && mv output.tsv /outputs/"

tools/equibind.json

+2-1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
"class": "CommandLineTool",
33
"name": "equibind",
44
"description": "Docking of small molecules to a protein",
5+
"author": "@misc{stärk2022equibind,\n title={EquiBind: Geometric Deep Learning for Drug Binding Structure Prediction}, \n author={Hannes Stärk and Octavian-Eugen Ganea and Lagnajit Pattanaik and Regina Barzilay and Tommi Jaakkola},\n year={2022},\n eprint={2202.05146},\n archivePrefix={arXiv},\n primaryClass={q-bio.BM}\n}",
56
"baseCommand": ["/bin/bash", "-c"],
67
"arguments": [
78
"mkdir -p /tmp-inputs/tmp;",
@@ -39,4 +40,4 @@
3940
"glob": ["*.pdb"]
4041
}
4142
}
42-
}
43+
}

tools/fastqc/fastqc.json

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
"class": "Tool",
33
"name": "fastqc",
44
"description": "Comprehensive quality control tool for high-throughput sequence data",
5+
"author": "",
56
"doi": "https://doi.org/10.48550/arXiv.2202.05146",
67
"baseCommand": ["/bin/bash", "-c"],
78
"arguments": [

tools/gnina/gnina.json

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
"class": "CommandLineTool",
33
"name": "gnina",
44
"description": "Protein-ligand docking using Gnina",
5+
"author": "",
56
"baseCommand": ["/bin/bash", "-c"],
67
"arguments": [
78
"gnina -r $(inputs.protein.filepath) -l $(inputs.small_molecule.filepath) --exhaustiveness $(inputs.exhaustiveness.default) --autobox_ligand $(inputs.protein.filepath) --cnn_scoring $(inputs.cnn_scoring.default) -o /outputs/$(inputs.protein.basename)_$(inputs.small_molecule.basename)_docked_scored.sdf"

tools/oddt.json

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
"class": "CommandLineTool",
33
"name": "oddt",
44
"description": "Scoring of protein-ligand complexes using ODDT",
5+
"author": "",
56
"baseCommand": ["/bin/bash", "-c"],
67
"arguments": [
78
"mkdir -p /tmp-out && oddt_cli $(inputs.small_molecule.filepath) --receptor $(inputs.protein.filepath) --score rfscore_v1 --score rfscore_v2 --score rfscore_v3 --score nnscore -O /tmp-out/$(inputs.protein.basename)_$(inputs.small_molecule.basename)_scored.$(inputs.small_molecule.ext) && cd /tmp-out && /app/aggregate_score.sh && cp /tmp-out/* /outputs"

tools/openbabel/pdb-to-sdf-openbabel.json

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
"class": "CommandLineTool",
33
"name": "pdb to sdf",
44
"description": "Convert pdb to sdf using openbabel.",
5+
"author": "",
56
"baseCommand": ["/bin/bash", "-c"],
67
"arguments": [
78
"obabel $(inputs.pdb_file.filepath) -O /outputs/$(inputs.pdb_file.basename).sdf;"

tools/openbabel/rmsd-openbabel.json

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
"class": "CommandLineTool",
33
"name": "rmsd",
44
"description": "calculate the RMSD of a reference small molecule and a docked small molecule using openbabel.",
5+
"author": "",
56
"baseCommand": ["/bin/bash", "-c"],
67
"arguments": [
78
"echo 'reference,comparison,RMSD' > /outputs/rmsd.csv && echo -n '$(inputs.reference_structure.basename),$(inputs.comparison_structure.basename),' > /outputs/temp.csv && obrms -firstonly $(inputs.reference_structure.filepath) $(inputs.comparison_structure.filepath) | awk '{print $2}' | tr -d '\\n' >> /outputs/temp.csv && cat /outputs/temp.csv >> /outputs/rmsd.csv && rm /outputs/temp.csv;"

tools/protbert/protbert.json

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
"class": "CommandLineTool",
33
"name": "protbert",
44
"description": "Predicting unknown residues with protein language models",
5+
"author": "",
56
"baseCommand": ["/bin/bash", "-c"],
67
"arguments": [
78
"python3 app.py $(inputs.protein_sequence.filepath) /outputs --mode fill-mask;",

0 commit comments

Comments
 (0)