From 0037fe0ba09c45017085e530c49d3267f3c9b2f0 Mon Sep 17 00:00:00 2001 From: Alexander Veit <53857412+alexander-veit@users.noreply.github.com> Date: Thu, 15 Aug 2024 12:06:26 -0400 Subject: [PATCH 1/2] Add RNA-Seq MWFR --- CHANGELOG.rst | 7 ++ .../commands/create_meta_workflow_run.py | 63 ++++++++++- magma_smaht/commands/wrangler_utils.py | 4 +- magma_smaht/create_metawfr.py | 107 ++++++++++++++++-- magma_smaht/utils.py | 23 ++++ magma_smaht/wrangler_utils.py | 5 +- pyproject.toml | 2 +- 7 files changed, 193 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index fb26471..188e7e3 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -3,6 +3,13 @@ Change Log ========== +3.7.0 +===== +* Add support for BAM2FASTQ MetaWorkflowRun +* Add RNA-Seq alignment MetaWorkflowRun + + + 3.6.2 ===== * Add short_reads_FASTQ_quality_metrics MetaWorkflowRun diff --git a/magma_smaht/commands/create_meta_workflow_run.py b/magma_smaht/commands/create_meta_workflow_run.py index e65fd1b..8e19439 100644 --- a/magma_smaht/commands/create_meta_workflow_run.py +++ b/magma_smaht/commands/create_meta_workflow_run.py @@ -2,16 +2,18 @@ from magma_smaht.create_metawfr import ( mwfr_illumina_alignment, + mwfr_rnaseq_alignment, mwfr_pacbio_alignment, mwfr_fastqc, mwfr_hic_alignment, mwfr_ont_alignment, mwfr_cram_to_fastq_paired_end, + mwfr_bam_to_fastq_paired_end, mwfr_bamqc_short_read, mwfr_ubam_qc_long_read, mwfr_ultra_long_bamqc, mwfr_long_read_bamqc, - mwfr_short_read_fastqc + mwfr_short_read_fastqc, ) from magma_smaht.utils import get_auth_key @@ -48,6 +50,31 @@ def align_illumina(fileset_accession, length_required, auth_env): mwfr_illumina_alignment(fileset_accession, length_required, smaht_key) +@cli.command() +@click.help_option("--help", "-h") +@click.option( + "-f", "--fileset-accession", required=True, type=str, help="Fileset accession" +) +@click.option( + "-l", + "--sequence-length", + required=True, + type=int, + help="Sequence length (can be obtained from FastQC output)", +) +@click.option( + "-e", + "--auth-env", + required=True, + type=str, + help="Name of environment in smaht-keys file", +) +def align_rnaseq(fileset_accession, sequence_length, auth_env): + """Alignment MWFR for RNA-Seq data""" + smaht_key = get_auth_key(auth_env) + mwfr_rnaseq_alignment(fileset_accession, sequence_length, smaht_key) + + @cli.command() @click.help_option("--help", "-h") @click.option( @@ -107,6 +134,14 @@ def align_ont(fileset_accession, auth_env): @click.option( "-f", "--fileset-accession", required=True, type=str, help="Fileset accession" ) +@click.option( + "-c", + "--check-lanes", + required=True, + default=True, + type=bool, + help="Wether to check lanes or not (different MWFs)", +) @click.option( "-e", "--auth-env", @@ -114,17 +149,15 @@ def align_ont(fileset_accession, auth_env): type=str, help="Name of environment in smaht-keys file", ) -def qc_short_read_fastq_illumina(fileset_accession, auth_env): +def qc_short_read_fastq_illumina(fileset_accession, check_lanes, auth_env): """QC MWFR for paired short-read Illumina FASTQs""" smaht_key = get_auth_key(auth_env) - mwfr_fastqc(fileset_accession, smaht_key) + mwfr_fastqc(fileset_accession, check_lanes, smaht_key) @cli.command() @click.help_option("--help", "-h") -@click.option( - "-f", "--file-accession", required=True, type=str, help="File accession" -) +@click.option("-f", "--file-accession", required=True, type=str, help="File accession") @click.option( "-e", "--auth-env", @@ -222,5 +255,23 @@ def conversion_cram_to_fastq(fileset_accession, auth_env): mwfr_cram_to_fastq_paired_end(fileset_accession, smaht_key) +@cli.command() +@click.help_option("--help", "-h") +@click.option( + "-f", "--fileset-accession", required=True, type=str, help="Fileset accession" +) +@click.option( + "-e", + "--auth-env", + required=True, + type=str, + help="Name of environment in smaht-keys file", +) +def conversion_bam_to_fastq(fileset_accession, auth_env): + """Conversion MWFR for BAM to FASTQ (paired-end)""" + smaht_key = get_auth_key(auth_env) + mwfr_bam_to_fastq_paired_end(fileset_accession, smaht_key) + + if __name__ == "__main__": cli() diff --git a/magma_smaht/commands/wrangler_utils.py b/magma_smaht/commands/wrangler_utils.py index b7bc406..d9156f2 100644 --- a/magma_smaht/commands/wrangler_utils.py +++ b/magma_smaht/commands/wrangler_utils.py @@ -26,8 +26,8 @@ def cli(): type=str, help="Name of environment in smaht-keys file", ) -def cram2fastq_out_to_fileset(mwfr_identifier, auth_env): - """Associate CRAM2FASTQ output with fileset""" +def associate_conversion_output_with_fileset(mwfr_identifier, auth_env): + """Associate CRAM2FASTQ or BAM2FASTQ output with fileset""" smaht_key = get_auth_key(auth_env) wrangler_utils.associate_conversion_output_with_fileset(mwfr_identifier, smaht_key) diff --git a/magma_smaht/create_metawfr.py b/magma_smaht/create_metawfr.py index 4c512ae..f2cf48f 100644 --- a/magma_smaht/create_metawfr.py +++ b/magma_smaht/create_metawfr.py @@ -22,6 +22,7 @@ get_latest_mwf, get_file_set, get_library_from_file_set, + get_library_preparation_from_library, get_sample_from_library, get_mwfr_file_input_arg, get_mwfr_parameter_input_arg, @@ -30,6 +31,7 @@ # MetaWorkflow names are used to get the latest version. # We assume that they don't change! MWF_NAME_ILLUMINA = "Illumina_alignment_GRCh38" +MWF_NAME_RNASEQ = "RNA-seq_bulk_short_reads_GRCh38" MWF_NAME_ONT = "ONT_alignment_GRCh38" MWF_NAME_PACBIO = "PacBio_alignment_GRCh38" MWF_NAME_HIC = "Hi-C_alignment_GRCh38" @@ -37,6 +39,7 @@ MWF_NAME_FASTQ_LONG_READ = "long_reads_FASTQ_quality_metrics" MWF_NAME_FASTQ_SHORT_READ = "short_reads_FASTQ_quality_metrics" MWF_NAME_CRAM_TO_FASTQ_PAIRED_END = "cram_to_fastq_paired-end" +MWF_NAME_BAM_TO_FASTQ_PAIRED_END = "bam_to_fastq_paired-end" MWF_NAME_BAMQC_SHORT_READ = "paired-end_short_reads_BAM_quality_metrics_GRCh38" MWF_NAME_ULTRA_LONG_BAMQC = "ultra-long_reads_BAM_quality_metrics_GRCh38" MWF_NAME_LONG_READ_BAMQC = "long_reads_BAM_quality_metrics_GRCh38" @@ -52,6 +55,9 @@ SAMPLE_NAME = "sample_name" LENGTH_REQUIRED = "length_required" LIBRARY_ID = "library_id" +GENOME_REFERENCE_STAR = "genome_reference_star" +IS_STRANDED = "is_stranded" +STRANDEDNESS = "strandedness" # Schema fields COMMON_FIELDS = "common_fields" @@ -64,6 +70,8 @@ ACCESSION = "accession" ALIASES = "aliases" UPLOADED = "uploaded" +FIRST_STRANDED = "First Stranded" +SECOND_STRANDED = "Second Stranded" ################################################ @@ -88,6 +96,56 @@ def mwfr_illumina_alignment(fileset_accession, length_required, smaht_key): ) +def mwfr_rnaseq_alignment(fileset_accession, sequence_length, smaht_key): + """Creates a MetaWorflowRun item in the portal for RNA-Seq alignment of submitted files within a fileset""" + + mwf = get_latest_mwf(MWF_NAME_RNASEQ, smaht_key) + print(f"Using MetaWorkflow {mwf[ACCESSION]} ({mwf[ALIASES][0]})") + + file_set = get_file_set(fileset_accession, smaht_key) + mwfr_input = get_core_alignment_mwfr_input_from_readpairs( + file_set, INPUT_FILES_R1_FASTQ_GZ, INPUT_FILES_R2_FASTQ_GZ, smaht_key + ) + # RNA-Seq specific input + genome_reference_star_alias = f"smaht:ReferenceFile-star-index-no-alt-no-hla-gencode45-oh{sequence_length-1}_GCA_000001405.15_GRCh38_no_decoy" + search_reference_file = "?type=File" f"&aliases={genome_reference_star_alias}" + reference_files = ff_utils.search_metadata( + f"/search/{search_reference_file}", key=smaht_key + ) + if len(reference_files) != 1: + raise Exception( + f"Did not find exactly one genome_reference_star reference file. Search was: {search_reference_file}" + ) + genome_reference_star_file = reference_files[0] + + mwfr_input.append( + get_mwfr_file_input_arg( + GENOME_REFERENCE_STAR, + [{"file": genome_reference_star_file[UUID]}], + ) + ) + + # Get strandedness info + library = get_library_from_file_set(file_set, smaht_key) + library_preparation = get_library_preparation_from_library(library, smaht_key) + strand = library_preparation["strand"] + + if strand in [FIRST_STRANDED, SECOND_STRANDED]: + strandedness_mapping = {FIRST_STRANDED: "rf", SECOND_STRANDED: "fr"} + mwfr_input.extend( + [ + get_mwfr_parameter_input_arg(IS_STRANDED, "true"), + get_mwfr_parameter_input_arg( + STRANDEDNESS, strandedness_mapping[strand] + ), + ] + ) + + create_and_post_mwfr( + mwf[UUID], file_set, INPUT_FILES_R1_FASTQ_GZ, mwfr_input, smaht_key + ) + + def mwfr_pacbio_alignment(fileset_accession, smaht_key): """Creates a MetaWorflowRun item in the portal for PacBio alignment of submitted files within a fileset""" @@ -196,19 +254,51 @@ def mwfr_cram_to_fastq_paired_end(fileset_accession, smaht_key): get_mwfr_file_input_arg(INPUT_FILES_CRAM, crams), get_mwfr_file_input_arg(GENOME_REFERENCE_FASTA, reference_genome), ] - # pprint.pprint(mwfr_input) create_and_post_mwfr(mwf[UUID], file_set, INPUT_FILES_CRAM, mwfr_input, smaht_key) +def mwfr_bam_to_fastq_paired_end(fileset_accession, smaht_key): + file_set = get_file_set(fileset_accession, smaht_key) + mwf = get_latest_mwf(MWF_NAME_BAM_TO_FASTQ_PAIRED_END, smaht_key) + print(f"Using MetaWorkflow {mwf[ACCESSION]} ({mwf[ALIASES][0]})") + + # Get submitted CRAMs in fileset (can be aligned or unaligned) + search_filter = ( + f"?file_sets.uuid={file_set[UUID]}" + "&type=SubmittedFile" + "&file_format.display_title=bam" + ) + files_to_run = ff_utils.search_metadata((f"search/{search_filter}"), key=smaht_key) + files_to_run.reverse() + + if len(files_to_run) == 0: + print(f"No files to run for search {search_filter}") + return + + bams = [] + for dim, file in enumerate(files_to_run): + bams.append({"file": file[UUID], "dimension": f"{dim}"}) + + mwfr_input = [ + get_mwfr_file_input_arg(INPUT_FILES_BAM, bams), + ] + create_and_post_mwfr(mwf[UUID], file_set, INPUT_FILES_BAM, mwfr_input, smaht_key) + + ################################################ # QC MetaWorkflowRuns ################################################ -def mwfr_fastqc(fileset_accession, smaht_key): +def mwfr_fastqc(fileset_accession, check_lanes, smaht_key): file_set = get_file_set(fileset_accession, smaht_key) - mwf = get_latest_mwf(MWF_NAME_FASTQC, smaht_key) + if check_lanes: + print(f"Using MetaWorkflow {MWF_NAME_FASTQC}") + mwf = get_latest_mwf(MWF_NAME_FASTQC, smaht_key) + else: + print(f"Using MetaWorkflow {MWF_NAME_FASTQ_SHORT_READ}") + mwf = get_latest_mwf(MWF_NAME_FASTQ_SHORT_READ, smaht_key) print(f"Using MetaWorkflow {mwf[ACCESSION]} ({mwf[ALIASES][0]})") # Get unaligned R2 reads in the fileset that don't have already QC @@ -245,6 +335,11 @@ def mwfr_fastqc(fileset_accession, smaht_key): files_input_list = sorted(files_input, key=lambda x: x["dimension"]) + # If we are not running the Illumina MWF, reformat to 1D list + if not check_lanes: + for i, inp in enumerate(files_input_list): + inp["dimension"] = f"{i}" + mwfr_input = [get_mwfr_file_input_arg(INPUT_FILES_FASTQ_GZ, files_input_list)] create_and_post_mwfr( mwf[UUID], file_set, INPUT_FILES_FASTQ_GZ, mwfr_input, smaht_key @@ -252,7 +347,7 @@ def mwfr_fastqc(fileset_accession, smaht_key): def mwfr_ubam_qc_long_read(fileset_accession, smaht_key): - + file_set = get_file_set(fileset_accession, smaht_key) mwf = get_latest_mwf(MWF_NAME_FASTQ_LONG_READ, smaht_key) print(f"Using MetaWorkflow {mwf[ACCESSION]} ({mwf[ALIASES][0]})") @@ -391,9 +486,7 @@ def get_core_alignment_mwfr_input(file_set, file_input_arg, smaht_key): sample = get_sample_from_library(library, smaht_key) search_filter = ( - "?type=UnalignedReads" - f"&file_sets.uuid={file_set[UUID]}" - f"&status={UPLOADED}" + "?type=UnalignedReads" f"&file_sets.uuid={file_set[UUID]}" f"&status={UPLOADED}" ) search_result = ff_utils.search_metadata(f"/search/{search_filter}", key=smaht_key) search_result.reverse() diff --git a/magma_smaht/utils.py b/magma_smaht/utils.py index 3534555..c5dbc1a 100644 --- a/magma_smaht/utils.py +++ b/magma_smaht/utils.py @@ -201,6 +201,29 @@ def get_sample_from_library(library, smaht_key): return sample +def get_library_preparation_from_library(library, smaht_key): + """Get the library preparation that is associated with a library + + Args: + library (dict): library item from portal + smaht_key (dict): SMaHT key + + Raises: + Exception: Raises an exception when there is no library preparation + + Returns: + dict: library_preparation item from portal + """ + library_preparation = library.get("library_preparation") + if not library_preparation: + raise Exception(f"No library preparation found for library {library['accession']}") + + library_preparation_item = ff_utils.get_metadata( + library_preparation, add_on="frame=raw&datastore=database", key=smaht_key + ) + return library_preparation_item + + def get_latest_mwf(mwf_name, smaht_key): """Get the latest version of the MWF with name `mwf_name` diff --git a/magma_smaht/wrangler_utils.py b/magma_smaht/wrangler_utils.py index f316d23..21c8ef4 100644 --- a/magma_smaht/wrangler_utils.py +++ b/magma_smaht/wrangler_utils.py @@ -14,8 +14,9 @@ JsonObject = Dict[str, Any] WF_CRAM_TO_FASTQ_PAIRED_END = "cram_to_fastq_paired-end" +WF_BAM_TO_FASTQ_PAIRED_END = "bam_to_fastq_paired-end" -SUPPORTED_MWF = [MWF_NAME_CRAM_TO_FASTQ_PAIRED_END] +SUPPORTED_MWF = [MWF_NAME_CRAM_TO_FASTQ_PAIRED_END, WF_BAM_TO_FASTQ_PAIRED_END] # Portal constants COMPLETED = "completed" @@ -48,7 +49,7 @@ def associate_conversion_output_with_fileset( for wfr in mwfr_meta["workflow_runs"]: output = wfr["output"] - if wfr["name"] != WF_CRAM_TO_FASTQ_PAIRED_END: + if wfr["name"] not in [WF_CRAM_TO_FASTQ_PAIRED_END, WF_BAM_TO_FASTQ_PAIRED_END]: continue if len(output) != 2: print_error_and_exit(f"Expected exactly 2 output files in WorkflowRun") diff --git a/pyproject.toml b/pyproject.toml index 3a78190..7d213db 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "magma-suite" -version = "3.6.2" +version = "3.7.0" description = "Collection of tools to manage meta-workflows automation." authors = ["Michele Berselli ", "Doug Rioux", "Soo Lee", "CGAP team"] license = "MIT" From b1847e5e01ac762d26eb907f9c276684ef1069c5 Mon Sep 17 00:00:00 2001 From: Alexander Veit <53857412+alexander-veit@users.noreply.github.com> Date: Thu, 15 Aug 2024 13:37:35 -0400 Subject: [PATCH 2/2] Update create_meta_workflow_run.py --- magma_smaht/commands/create_meta_workflow_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/magma_smaht/commands/create_meta_workflow_run.py b/magma_smaht/commands/create_meta_workflow_run.py index 8e19439..8238b24 100644 --- a/magma_smaht/commands/create_meta_workflow_run.py +++ b/magma_smaht/commands/create_meta_workflow_run.py @@ -140,7 +140,7 @@ def align_ont(fileset_accession, auth_env): required=True, default=True, type=bool, - help="Wether to check lanes or not (different MWFs)", + help="Whether to check lanes or not (different MWFs)", ) @click.option( "-e",