[LAB-470] 442 tool author attribution to iojson (#599)

hevans66 · acashmoney · web-flow · commit a6fc994663cd · 2023-08-18T18:39:43.000-04:00
Co-authored-by: Aakaash Meduri <aakaash.meduri@gmail.com>
diff --git a/internal/ipwl/testdata/example_tool.json b/internal/ipwl/testdata/example_tool.json
@@ -1,39 +1,43 @@
 {
-    "class": "CommandLineTool",
-    "name": "equibind",
-    "description": "Docking of small molecules to a protein",
-    "baseCommand": ["/bin/bash", "-c"],
-    "arguments": [
-      "python main.py --protein $(inputs.protein.filepath) --small_molecule_library $(inputs.small_molecule.filepath);",
-      "mv /outputs/ligands_predicted.sdf /outputs/$(inputs.protein.basename)_$(inputs.small_molecule.basename)_docked.$(inputs.small_molecule.ext);",
-      "cp $(inputs.protein.filepath) /outputs/;",
-      "rmdir /outputs/dummy;"
-    ],
-    "dockerPull": "ghcr.io/labdao/equibind@sha256:ae2cec63b3924774727ed1c6c8af95cf4aaea2d3f0c5acbec56478505ccb2b07",
-    "gpuBool": false,
-    "networkBool": false,
-    "inputs": {
-      "protein": {
-        "type": "File",
-        "item": "",
-        "glob": ["*.pdb"]
-      },
-      "small_molecule": {
-        "type": "File",
-        "item": "",
-        "glob": ["*.sdf", "*.mol2"]
-      }
+  "class": "CommandLineTool",
+  "name": "equibind",
+  "description": "Docking of small molecules to a protein",
+  "author": "@misc{stärk2022equibind,\n      title={EquiBind: Geometric Deep Learning for Drug Binding Structure Prediction}, \n      author={Hannes Stärk and Octavian-Eugen Ganea and Lagnajit Pattanaik and Regina Barzilay and Tommi Jaakkola},\n      year={2022},\n      eprint={2202.05146},\n      archivePrefix={arXiv},\n      primaryClass={q-bio.BM}\n}",
+  "baseCommand": ["/bin/bash", "-c"],
+  "arguments": [
+    "mkdir -p /tmp-inputs/tmp;",
+    "mkdir -p /tmp-outputs/tmp;",
+    "cp /inputs/* /tmp-inputs/tmp/;",
+    "ls /tmp-inputs/tmp;",
+    "cd /src && python /src/inference.py --config=/src/configs_clean/bacalhau.yml;",
+    "mv /tmp-outputs/tmp/* /outputs/;",
+    "mv /outputs/lig_equibind_corrected.sdf /outputs/$(inputs.protein.basename)_$(inputs.small_molecule.basename)_docked.$(inputs.small_molecule.ext);",
+    "mv /tmp-inputs/tmp/*.pdb /outputs/;"],
+  "dockerPull": "ghcr.io/labdao/equibind:main@sha256:21a381d9ab1ff047565685044569c8536a55e489c9531326498b28d6b3cc244f",
+  "gpuBool": false,
+  "networkBool": false,
+  "inputs": {
+    "protein": {
+      "type": "File",
+      "item": "",
+      "glob": ["*.pdb"]
     },
-    "outputs": {
-      "best_docked_small_molecule": {
-        "type": "File",
-        "item": "",
-        "glob": ["*_docked.sdf"]
-      },
-      "protein": {
-        "type": "File", 
-        "item": "",
-        "glob": ["*.pdb"]
-      }
+    "small_molecule": {
+      "type": "File",
+      "item": "",
+      "glob": ["*.sdf", "*.mol2"]
     }
-}
+  },
+  "outputs": {
+    "best_docked_small_molecule": {
+      "type": "File",
+      "item": "",
+      "glob": ["*_docked.sdf", "*_docked.mol2"]
+    },
+    "protein": {
+      "type": "File", 
+      "item": "",
+      "glob": ["*.pdb"]
+    }
+  }
+}
diff --git a/internal/ipwl/tool.go b/internal/ipwl/tool.go
@@ -28,6 +28,7 @@ type ToolOutput struct {
 type Tool struct {
 	Name        string                `json:"name"`
 	Description string                `json:"description"`
+	Author      string                `json:"author"`
 	BaseCommand []string              `json:"baseCommand"`
 	Arguments   []string              `json:"arguments"`
 	DockerPull  string                `json:"dockerPull"`
diff --git a/internal/ipwl/tool_test.go b/internal/ipwl/tool_test.go
@@ -9,15 +9,20 @@ func TestReadToolConfig(t *testing.T) {
 	filePath := "testdata/example_tool.json"
 	expected := Tool{
 		Name:        "equibind",
+		Author:      "@misc{stärk2022equibind,\n      title={EquiBind: Geometric Deep Learning for Drug Binding Structure Prediction}, \n      author={Hannes Stärk and Octavian-Eugen Ganea and Lagnajit Pattanaik and Regina Barzilay and Tommi Jaakkola},\n      year={2022},\n      eprint={2202.05146},\n      archivePrefix={arXiv},\n      primaryClass={q-bio.BM}\n}",
 		Description: "Docking of small molecules to a protein",
 		BaseCommand: []string{"/bin/bash", "-c"},
 		Arguments: []string{
-			"python main.py --protein $(inputs.protein.filepath) --small_molecule_library $(inputs.small_molecule.filepath);",
-			"mv /outputs/ligands_predicted.sdf /outputs/$(inputs.protein.basename)_$(inputs.small_molecule.basename)_docked.$(inputs.small_molecule.ext);",
-			"cp $(inputs.protein.filepath) /outputs/;",
-			"rmdir /outputs/dummy;",
+			"mkdir -p /tmp-inputs/tmp;",
+			"mkdir -p /tmp-outputs/tmp;",
+			"cp /inputs/* /tmp-inputs/tmp/;",
+			"ls /tmp-inputs/tmp;",
+			"cd /src && python /src/inference.py --config=/src/configs_clean/bacalhau.yml;",
+			"mv /tmp-outputs/tmp/* /outputs/;",
+			"mv /outputs/lig_equibind_corrected.sdf /outputs/$(inputs.protein.basename)_$(inputs.small_molecule.basename)_docked.$(inputs.small_molecule.ext);",
+			"mv /tmp-inputs/tmp/*.pdb /outputs/;",
 		},
-		DockerPull: "ghcr.io/labdao/equibind@sha256:ae2cec63b3924774727ed1c6c8af95cf4aaea2d3f0c5acbec56478505ccb2b07",
+		DockerPull: "ghcr.io/labdao/equibind:main@sha256:21a381d9ab1ff047565685044569c8536a55e489c9531326498b28d6b3cc244f",
 		GpuBool:    false,
 		Inputs: map[string]ToolInput{
 			"protein": {
@@ -32,7 +37,7 @@ func TestReadToolConfig(t *testing.T) {
 		Outputs: map[string]ToolOutput{
 			"best_docked_small_molecule": {
 				Type: "File",
-				Glob: []string{"*_docked.sdf"},
+				Glob: []string{"*_docked.sdf", "*_docked.mol2"},
 			},
 			"protein": {
 				Type: "File",
diff --git a/python/src/plex/__init__.py b/python/src/plex/__init__.py
@@ -13,14 +13,14 @@ class ScatteringMethod(Enum):
 
 
 class CoreTools(Enum):
-    EQUIBIND = "QmZ2HarAgwZGjc3LBx9mWNwAQkPWiHMignqKup1ckp8NhB"
-    DIFFDOCK = "QmSzetFkveiQYZ5FgpZdHHfsjMWYz5YzwMAvqUgUFhFPMM"
+    EQUIBIND = "QmZWYpZXsrbtzvBCHngh4YEgME5djnV5EedyTpc8DrK7k2"
+    DIFFDOCK = "QmfKhJh48aDHgckzwGEASNmZd1SYstQiR5qLqqYmLQFzq9"
     COLABFOLD_MINI = "QmcRH74qfqDBJFku3mEDGxkAf6CSpaHTpdbe1pMkHnbcZD"
     COLABFOLD_STANDARD = "QmXnM1VpdGgX5huyU3zTjJovsu42KPfWhjxhZGkyvy9PVk"
     COLABFOLD_LARGE = "QmPYqMy19VFFuYztL6b5ruo4Kw4JWT583emStGrSYTH5Yi"
     BAM2FASTQ = "QmbPUirWiWCv9sgdHLekf5AnoCdw4QPU2SyfGGKs9JRRbq"
     ODDT = "QmUx7NdxkXXZvbK1JXZVUYUBqsevWkbVxgTzpWJ4Xp4inf"
-    RFDIFFUSION = "QmXnCBCtoYuPyGsEJVpjn5regHfFSYa8kx44e22XxDX2t2"
+    RFDIFFUSION = "QmTyFGjt2oqTLGQRE5u8mtfiQNft5nzMsieYdvwnpfk3HJ"
     REPEATMODELER = "QmZdXxnUt1sFFR39CfkEUgiioUBf6qP5CUs8TCb7Wqn4MC"
     GNINA = "QmZiQWEXj3aMRnJLoU39HHcknMDfKQD2txpfk6ubJAdDRx"
     BATCH_DLKCAT = "QmQTjvP2utNb1JTtUHeQ8mQPvNkCTg5VRc4LVdptWkUcJ7"
diff --git a/tools/bam2fastq.json b/tools/bam2fastq.json
@@ -2,6 +2,7 @@
     "class": "CommandLineTool",
     "name": "bam2fastq",
     "description": "Sort BAM by qname and Extract Fasta reads R1 R2 with RG using samtools",
+    "author": "",
     "inputs": {
         "genome": {
             "type": "File",
diff --git a/tools/blender/blender.json b/tools/blender/blender.json
@@ -2,6 +2,7 @@
     "class": "CommandLineTool",
     "name": "blender",
     "description": "let's create some fancy protein graphics",
+    "author": "",
     "baseCommand": ["/bin/bash", "-c"],
     "arguments": [
         "blender --background --python app.py -- $(inputs.protein.filepath) /outputs/protein.png"
diff --git a/tools/colabfold-large.json b/tools/colabfold-large.json
@@ -2,6 +2,7 @@
     "class": "CommandLineTool",
     "name": "colabfold-large",
     "description": "Protein folding prediction using Colabfold (large settings)",
+    "author": "",
     "baseCommand": ["/bin/bash", "-c"],
     "arguments": [
       "colabfold_batch --templates --num-recycle $(inputs.recycle.default) --use-gpu-relax --amber /inputs /outputs;"
diff --git a/tools/colabfold-mini.json b/tools/colabfold-mini.json
@@ -2,6 +2,7 @@
     "class": "CommandLineTool",
     "name": "colabfold-mini",
     "description": "Protein folding prediction using Colabfold (mini settings)",
+    "author": "",
     "baseCommand": ["/bin/bash", "-c"],
     "arguments": [
       "colabfold_batch --templates --max-msa 32:64 --num-recycle $(inputs.recycle.default) /inputs /outputs;"
diff --git a/tools/colabfold-standard.json b/tools/colabfold-standard.json
@@ -2,6 +2,7 @@
     "class": "CommandLineTool",
     "name": "colabfold-standard",
     "description": "Protein folding prediction using Colabfold (standard settings)",
+    "author": "",
     "baseCommand": ["/bin/bash", "-c"],
     "arguments": [
       "colabfold_batch --templates --num-recycle $(inputs.recycle.default) /inputs /outputs;"
diff --git a/tools/diffdock.json b/tools/diffdock.json
@@ -1,63 +1,64 @@
 {
-    "class": "CommandLineTool",
-    "name": "diffdock",
-    "description": "Docking of small molecules to a protein",
-    "baseCommand": ["/bin/bash", "-c"],
-    "arguments": [
-      "python datasets/esm_embedding_preparation.py --protein_path $(inputs.protein.filepath) --out_file /outputs/prepared_for_esm.fasta;",
-      "HOME=esm/model_weights python esm/scripts/extract.py esm2_t33_650M_UR50D /outputs/prepared_for_esm.fasta /outputs/esm2_output --repr_layers $(inputs.repr_layers.default) --include per_tok && cp -r /outputs/esm2_output data/esm2_output;",
-      "python -m inference --protein_path $(inputs.protein.filepath) --ligand $(inputs.small_molecule.filepath) --out_dir /outputs --inference_steps $(inputs.inference_steps.default) --samples_per_complex $(inputs.samples_per_complex.default) --batch_size $(inputs.batch_size.default) --actual_steps $(inputs.actual_steps.default) --no_final_step_noise;",
-      "cp $(inputs.protein.filepath) /outputs"
-    ],
-    "dockerPull": "ghcr.io/labdao/diffdock:main@sha256:b00432de73478d3da578e4a16ee669178828109f3c7bf9c58d44bb7514f68629",
-    "gpuBool": true,
-    "networkBool": true,
-    "memoryGB": 12,
-    "inputs": {
-      "protein": {
-        "type": "File",
-        "glob": ["*.pdb"]
-      },
-      "small_molecule": {
-        "type": "File",
-        "glob": ["*.sdf", "*.mol2"]
-      },
-      "repr_layers": {
-        "type": "int",
-        "default": "33"
-      },
-      "inference_steps": {
-        "type": "int",
-        "default": "20"
-      },
-      "samples_per_complex": {
-        "type": "int",
-        "default": "40"
-      },
-      "batch_size": {
-        "type": "int",
-        "default": "10"
-      },
-      "actual_steps": {
-        "type": "int",
-        "default": "18"
-      }
-    },
-    "outputs": {
-      "best_docked_small_molecule": {
-        "type": "File",
-        "item": "",
-        "glob": ["index*/rank1.sdf"]
-      },
-      "all_docked_small_molecules": {
-        "type": "Array",
-        "item": "File",
-        "glob": ["index*/rank*.sdf"]
-      },
-      "protein": {
-        "type": "File",
-        "item": "",
-        "glob": ["*.pdb"]
-      }
+  "class": "CommandLineTool",
+  "name": "diffdock",
+  "description": "Docking of small molecules to a protein",
+  "author": "@misc{corso2023diffdock,\n      title={DiffDock: Diffusion Steps, Twists, and Turns for Molecular Docking},\n      author={Gabriele Corso and Hannes Stärk and Bowen Jing and Regina Barzilay and Tommi Jaakkola},\n      year={2023},\n      eprint={2210.01776},\n      archivePrefix={arXiv},\n      primaryClass={q-bio.BM}\n}",
+  "baseCommand": ["/bin/bash", "-c"],
+  "arguments": [
+    "python datasets/esm_embedding_preparation.py --protein_path $(inputs.protein.filepath) --out_file /outputs/prepared_for_esm.fasta;",
+    "HOME=esm/model_weights python esm/scripts/extract.py esm2_t33_650M_UR50D /outputs/prepared_for_esm.fasta /outputs/esm2_output --repr_layers $(inputs.repr_layers.default) --include per_tok && cp -r /outputs/esm2_output data/esm2_output;",
+    "python -m inference --protein_path $(inputs.protein.filepath) --ligand $(inputs.small_molecule.filepath) --out_dir /outputs --inference_steps $(inputs.inference_steps.default) --samples_per_complex $(inputs.samples_per_complex.default) --batch_size $(inputs.batch_size.default) --actual_steps $(inputs.actual_steps.default) --no_final_step_noise;",
+    "cp $(inputs.protein.filepath) /outputs"
+  ],
+  "dockerPull": "ghcr.io/labdao/diffdock:main@sha256:b00432de73478d3da578e4a16ee669178828109f3c7bf9c58d44bb7514f68629",
+  "gpuBool": true,
+  "networkBool": true,
+  "memoryGB": 12,
+  "inputs": {
+    "protein": {
+      "type": "File",
+      "glob": ["*.pdb"]
+    },
+    "small_molecule": {
+      "type": "File",
+      "glob": ["*.sdf", "*.mol2"]
+    },
+    "repr_layers": {
+      "type": "int",
+      "default": "33"
+    },
+    "inference_steps": {
+      "type": "int",
+      "default": "20"
+    },
+    "samples_per_complex": {
+      "type": "int",
+      "default": "40"
+    },
+    "batch_size": {
+      "type": "int",
+      "default": "10"
+    },
+    "actual_steps": {
+      "type": "int",
+      "default": "18"
+    }
+  },
+  "outputs": {
+    "best_docked_small_molecule": {
+      "type": "File",
+      "item": "",
+      "glob": ["index*/rank1.sdf"]
+    },
+    "all_docked_small_molecules": {
+      "type": "Array",
+      "item": "File",
+      "glob": ["index*/rank*.sdf"]
+    },
+    "protein": {
+      "type": "File",
+      "item": "",
+      "glob": ["*.pdb"]
     }
+  }
 }
diff --git a/tools/dlkcat/batch_dlkcat.json b/tools/dlkcat/batch_dlkcat.json
@@ -2,6 +2,7 @@
     "class": "CommandLineTool",
     "name": "dlkcat",
     "description": "batch predict enzyme catalytic activity from a protein sequence and molecule smile",
+    "author": "",
     "baseCommand": ["/bin/bash", "-c"],
     "arguments": [
         "conda run -n env python prediction_for_input.py $(inputs.input_tsv.filepath) && mv output.tsv /outputs/"
diff --git a/tools/equibind.json b/tools/equibind.json
@@ -2,6 +2,7 @@
   "class": "CommandLineTool",
   "name": "equibind",
   "description": "Docking of small molecules to a protein",
+  "author": "@misc{stärk2022equibind,\n      title={EquiBind: Geometric Deep Learning for Drug Binding Structure Prediction}, \n      author={Hannes Stärk and Octavian-Eugen Ganea and Lagnajit Pattanaik and Regina Barzilay and Tommi Jaakkola},\n      year={2022},\n      eprint={2202.05146},\n      archivePrefix={arXiv},\n      primaryClass={q-bio.BM}\n}",
   "baseCommand": ["/bin/bash", "-c"],
   "arguments": [
     "mkdir -p /tmp-inputs/tmp;",
@@ -39,4 +40,4 @@
       "glob": ["*.pdb"]
     }
   }
-}
+}
diff --git a/tools/fastqc/fastqc.json b/tools/fastqc/fastqc.json
@@ -2,6 +2,7 @@
   "class": "Tool",
   "name": "fastqc",
   "description": "Comprehensive quality control tool for high-throughput sequence data",
+  "author": "",
   "doi": "https://doi.org/10.48550/arXiv.2202.05146",
   "baseCommand": ["/bin/bash", "-c"],
   "arguments": [
diff --git a/tools/gnina/gnina.json b/tools/gnina/gnina.json
@@ -2,6 +2,7 @@
     "class": "CommandLineTool",
     "name": "gnina",
     "description": "Protein-ligand docking using Gnina",
+    "author": "",
     "baseCommand": ["/bin/bash", "-c"],
     "arguments": [
       "gnina -r $(inputs.protein.filepath) -l $(inputs.small_molecule.filepath) --exhaustiveness $(inputs.exhaustiveness.default) --autobox_ligand $(inputs.protein.filepath) --cnn_scoring $(inputs.cnn_scoring.default) -o /outputs/$(inputs.protein.basename)_$(inputs.small_molecule.basename)_docked_scored.sdf"
diff --git a/tools/oddt.json b/tools/oddt.json
@@ -2,6 +2,7 @@
     "class": "CommandLineTool",
     "name": "oddt",
     "description": "Scoring of protein-ligand complexes using ODDT",
+    "author": "",
     "baseCommand": ["/bin/bash", "-c"],
     "arguments": [
       "mkdir -p /tmp-out && oddt_cli $(inputs.small_molecule.filepath) --receptor $(inputs.protein.filepath) --score rfscore_v1 --score rfscore_v2 --score rfscore_v3 --score nnscore -O /tmp-out/$(inputs.protein.basename)_$(inputs.small_molecule.basename)_scored.$(inputs.small_molecule.ext) && cd /tmp-out && /app/aggregate_score.sh && cp /tmp-out/* /outputs"
diff --git a/tools/openbabel/pdb-to-sdf-openbabel.json b/tools/openbabel/pdb-to-sdf-openbabel.json
@@ -2,6 +2,7 @@
     "class": "CommandLineTool",
     "name": "pdb to sdf",
     "description": "Convert pdb to sdf using openbabel.",
+    "author": "",
     "baseCommand": ["/bin/bash", "-c"],
     "arguments": [
       "obabel $(inputs.pdb_file.filepath) -O /outputs/$(inputs.pdb_file.basename).sdf;"
diff --git a/tools/openbabel/rmsd-openbabel.json b/tools/openbabel/rmsd-openbabel.json
@@ -2,6 +2,7 @@
     "class": "CommandLineTool",
     "name": "rmsd",
     "description": "calculate the RMSD of a reference small molecule and a docked small molecule using openbabel.",
+    "author": "",
     "baseCommand": ["/bin/bash", "-c"],
     "arguments": [
       "echo 'reference,comparison,RMSD' > /outputs/rmsd.csv && echo -n '$(inputs.reference_structure.basename),$(inputs.comparison_structure.basename),' > /outputs/temp.csv && obrms -firstonly $(inputs.reference_structure.filepath) $(inputs.comparison_structure.filepath) | awk '{print $2}' | tr -d '\\n' >> /outputs/temp.csv && cat /outputs/temp.csv >> /outputs/rmsd.csv && rm /outputs/temp.csv;"
diff --git a/tools/protbert/protbert.json b/tools/protbert/protbert.json
@@ -2,6 +2,7 @@
     "class": "CommandLineTool",
     "name": "protbert",
     "description": "Predicting unknown residues with protein language models",
+    "author": "",
     "baseCommand": ["/bin/bash", "-c"],
     "arguments": [
       "python3 app.py $(inputs.protein_sequence.filepath) /outputs --mode fill-mask;",
diff --git a/tools/rfdiffusion.json b/tools/rfdiffusion.json
diff --git a/tools/tetools/repeatmodeler.json b/tools/tetools/repeatmodeler.json

Original file line number	Diff line number	Diff line change
`@@ -2,6 +2,7 @@`
`2`	`2`	`"class": "CommandLineTool",`
`3`	`3`	`"name": "equibind",`
`4`	`4`	`"description": "Docking of small molecules to a protein",`
	`5`	`+ "author": "@misc{stärk2022equibind,\n title={EquiBind: Geometric Deep Learning for Drug Binding Structure Prediction}, \n author={Hannes Stärk and Octavian-Eugen Ganea and Lagnajit Pattanaik and Regina Barzilay and Tommi Jaakkola},\n year={2022},\n eprint={2202.05146},\n archivePrefix={arXiv},\n primaryClass={q-bio.BM}\n}",`
`5`	`6`	`"baseCommand": ["/bin/bash", "-c"],`
`6`	`7`	`"arguments": [`
`7`	`8`	`"mkdir -p /tmp-inputs/tmp;",`
`@@ -39,4 +40,4 @@`
`39`	`40`	`"glob": ["*.pdb"]`
`40`	`41`	`}`
`41`	`42`	`}`
`42`		`-}`
	`43`	`+}`