Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add RNA-Seq MWFR for SMaHT #39

Merged
merged 2 commits into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,13 @@
Change Log
==========

3.7.0
=====
* Add support for BAM2FASTQ MetaWorkflowRun
* Add RNA-Seq alignment MetaWorkflowRun



3.6.2
=====
* Add short_reads_FASTQ_quality_metrics MetaWorkflowRun
Expand Down
63 changes: 57 additions & 6 deletions magma_smaht/commands/create_meta_workflow_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,18 @@

from magma_smaht.create_metawfr import (
mwfr_illumina_alignment,
mwfr_rnaseq_alignment,
mwfr_pacbio_alignment,
mwfr_fastqc,
mwfr_hic_alignment,
mwfr_ont_alignment,
mwfr_cram_to_fastq_paired_end,
mwfr_bam_to_fastq_paired_end,
mwfr_bamqc_short_read,
mwfr_ubam_qc_long_read,
mwfr_ultra_long_bamqc,
mwfr_long_read_bamqc,
mwfr_short_read_fastqc
mwfr_short_read_fastqc,
)
from magma_smaht.utils import get_auth_key

Expand Down Expand Up @@ -48,6 +50,31 @@ def align_illumina(fileset_accession, length_required, auth_env):
mwfr_illumina_alignment(fileset_accession, length_required, smaht_key)


@cli.command()
@click.help_option("--help", "-h")
@click.option(
"-f", "--fileset-accession", required=True, type=str, help="Fileset accession"
)
@click.option(
"-l",
"--sequence-length",
required=True,
type=int,
help="Sequence length (can be obtained from FastQC output)",
)
@click.option(
"-e",
"--auth-env",
required=True,
type=str,
help="Name of environment in smaht-keys file",
)
def align_rnaseq(fileset_accession, sequence_length, auth_env):
"""Alignment MWFR for RNA-Seq data"""
smaht_key = get_auth_key(auth_env)
mwfr_rnaseq_alignment(fileset_accession, sequence_length, smaht_key)


@cli.command()
@click.help_option("--help", "-h")
@click.option(
Expand Down Expand Up @@ -107,24 +134,30 @@ def align_ont(fileset_accession, auth_env):
@click.option(
"-f", "--fileset-accession", required=True, type=str, help="Fileset accession"
)
@click.option(
"-c",
"--check-lanes",
required=True,
default=True,
type=bool,
help="Whether to check lanes or not (different MWFs)",
)
@click.option(
"-e",
"--auth-env",
required=True,
type=str,
help="Name of environment in smaht-keys file",
)
def qc_short_read_fastq_illumina(fileset_accession, auth_env):
def qc_short_read_fastq_illumina(fileset_accession, check_lanes, auth_env):
"""QC MWFR for paired short-read Illumina FASTQs"""
smaht_key = get_auth_key(auth_env)
mwfr_fastqc(fileset_accession, smaht_key)
mwfr_fastqc(fileset_accession, check_lanes, smaht_key)


@cli.command()
@click.help_option("--help", "-h")
@click.option(
"-f", "--file-accession", required=True, type=str, help="File accession"
)
@click.option("-f", "--file-accession", required=True, type=str, help="File accession")
@click.option(
"-e",
"--auth-env",
Expand Down Expand Up @@ -222,5 +255,23 @@ def conversion_cram_to_fastq(fileset_accession, auth_env):
mwfr_cram_to_fastq_paired_end(fileset_accession, smaht_key)


@cli.command()
@click.help_option("--help", "-h")
@click.option(
"-f", "--fileset-accession", required=True, type=str, help="Fileset accession"
)
@click.option(
"-e",
"--auth-env",
required=True,
type=str,
help="Name of environment in smaht-keys file",
)
def conversion_bam_to_fastq(fileset_accession, auth_env):
"""Conversion MWFR for BAM to FASTQ (paired-end)"""
smaht_key = get_auth_key(auth_env)
mwfr_bam_to_fastq_paired_end(fileset_accession, smaht_key)


if __name__ == "__main__":
cli()
4 changes: 2 additions & 2 deletions magma_smaht/commands/wrangler_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ def cli():
type=str,
help="Name of environment in smaht-keys file",
)
def cram2fastq_out_to_fileset(mwfr_identifier, auth_env):
"""Associate CRAM2FASTQ output with fileset"""
def associate_conversion_output_with_fileset(mwfr_identifier, auth_env):
"""Associate CRAM2FASTQ or BAM2FASTQ output with fileset"""
smaht_key = get_auth_key(auth_env)
wrangler_utils.associate_conversion_output_with_fileset(mwfr_identifier, smaht_key)

Expand Down
107 changes: 100 additions & 7 deletions magma_smaht/create_metawfr.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
get_latest_mwf,
get_file_set,
get_library_from_file_set,
get_library_preparation_from_library,
get_sample_from_library,
get_mwfr_file_input_arg,
get_mwfr_parameter_input_arg,
Expand All @@ -30,13 +31,15 @@
# MetaWorkflow names are used to get the latest version.
# We assume that they don't change!
MWF_NAME_ILLUMINA = "Illumina_alignment_GRCh38"
MWF_NAME_RNASEQ = "RNA-seq_bulk_short_reads_GRCh38"
MWF_NAME_ONT = "ONT_alignment_GRCh38"
MWF_NAME_PACBIO = "PacBio_alignment_GRCh38"
MWF_NAME_HIC = "Hi-C_alignment_GRCh38"
MWF_NAME_FASTQC = "Illumina_FASTQ_quality_metrics"
MWF_NAME_FASTQ_LONG_READ = "long_reads_FASTQ_quality_metrics"
MWF_NAME_FASTQ_SHORT_READ = "short_reads_FASTQ_quality_metrics"
MWF_NAME_CRAM_TO_FASTQ_PAIRED_END = "cram_to_fastq_paired-end"
MWF_NAME_BAM_TO_FASTQ_PAIRED_END = "bam_to_fastq_paired-end"
MWF_NAME_BAMQC_SHORT_READ = "paired-end_short_reads_BAM_quality_metrics_GRCh38"
MWF_NAME_ULTRA_LONG_BAMQC = "ultra-long_reads_BAM_quality_metrics_GRCh38"
MWF_NAME_LONG_READ_BAMQC = "long_reads_BAM_quality_metrics_GRCh38"
Expand All @@ -52,6 +55,9 @@
SAMPLE_NAME = "sample_name"
LENGTH_REQUIRED = "length_required"
LIBRARY_ID = "library_id"
GENOME_REFERENCE_STAR = "genome_reference_star"
IS_STRANDED = "is_stranded"
STRANDEDNESS = "strandedness"

# Schema fields
COMMON_FIELDS = "common_fields"
Expand All @@ -64,6 +70,8 @@
ACCESSION = "accession"
ALIASES = "aliases"
UPLOADED = "uploaded"
FIRST_STRANDED = "First Stranded"
SECOND_STRANDED = "Second Stranded"


################################################
Expand All @@ -88,6 +96,56 @@ def mwfr_illumina_alignment(fileset_accession, length_required, smaht_key):
)


def mwfr_rnaseq_alignment(fileset_accession, sequence_length, smaht_key):
"""Creates a MetaWorflowRun item in the portal for RNA-Seq alignment of submitted files within a fileset"""

mwf = get_latest_mwf(MWF_NAME_RNASEQ, smaht_key)
print(f"Using MetaWorkflow {mwf[ACCESSION]} ({mwf[ALIASES][0]})")

file_set = get_file_set(fileset_accession, smaht_key)
mwfr_input = get_core_alignment_mwfr_input_from_readpairs(
file_set, INPUT_FILES_R1_FASTQ_GZ, INPUT_FILES_R2_FASTQ_GZ, smaht_key
)
# RNA-Seq specific input
genome_reference_star_alias = f"smaht:ReferenceFile-star-index-no-alt-no-hla-gencode45-oh{sequence_length-1}_GCA_000001405.15_GRCh38_no_decoy"
search_reference_file = "?type=File" f"&aliases={genome_reference_star_alias}"
reference_files = ff_utils.search_metadata(
f"/search/{search_reference_file}", key=smaht_key
)
if len(reference_files) != 1:
raise Exception(
f"Did not find exactly one genome_reference_star reference file. Search was: {search_reference_file}"
)
genome_reference_star_file = reference_files[0]

mwfr_input.append(
get_mwfr_file_input_arg(
GENOME_REFERENCE_STAR,
[{"file": genome_reference_star_file[UUID]}],
)
)

# Get strandedness info
library = get_library_from_file_set(file_set, smaht_key)
library_preparation = get_library_preparation_from_library(library, smaht_key)
strand = library_preparation["strand"]

if strand in [FIRST_STRANDED, SECOND_STRANDED]:
strandedness_mapping = {FIRST_STRANDED: "rf", SECOND_STRANDED: "fr"}
mwfr_input.extend(
[
get_mwfr_parameter_input_arg(IS_STRANDED, "true"),
get_mwfr_parameter_input_arg(
STRANDEDNESS, strandedness_mapping[strand]
),
]
)

create_and_post_mwfr(
mwf[UUID], file_set, INPUT_FILES_R1_FASTQ_GZ, mwfr_input, smaht_key
)


def mwfr_pacbio_alignment(fileset_accession, smaht_key):
"""Creates a MetaWorflowRun item in the portal for PacBio alignment of submitted files within a fileset"""

Expand Down Expand Up @@ -196,19 +254,51 @@ def mwfr_cram_to_fastq_paired_end(fileset_accession, smaht_key):
get_mwfr_file_input_arg(INPUT_FILES_CRAM, crams),
get_mwfr_file_input_arg(GENOME_REFERENCE_FASTA, reference_genome),
]
# pprint.pprint(mwfr_input)
create_and_post_mwfr(mwf[UUID], file_set, INPUT_FILES_CRAM, mwfr_input, smaht_key)


def mwfr_bam_to_fastq_paired_end(fileset_accession, smaht_key):
file_set = get_file_set(fileset_accession, smaht_key)
mwf = get_latest_mwf(MWF_NAME_BAM_TO_FASTQ_PAIRED_END, smaht_key)
print(f"Using MetaWorkflow {mwf[ACCESSION]} ({mwf[ALIASES][0]})")

# Get submitted CRAMs in fileset (can be aligned or unaligned)
search_filter = (
f"?file_sets.uuid={file_set[UUID]}"
"&type=SubmittedFile"
"&file_format.display_title=bam"
)
files_to_run = ff_utils.search_metadata((f"search/{search_filter}"), key=smaht_key)
files_to_run.reverse()

if len(files_to_run) == 0:
print(f"No files to run for search {search_filter}")
return

bams = []
for dim, file in enumerate(files_to_run):
bams.append({"file": file[UUID], "dimension": f"{dim}"})

mwfr_input = [
get_mwfr_file_input_arg(INPUT_FILES_BAM, bams),
]
create_and_post_mwfr(mwf[UUID], file_set, INPUT_FILES_BAM, mwfr_input, smaht_key)


################################################
# QC MetaWorkflowRuns
################################################


def mwfr_fastqc(fileset_accession, smaht_key):
def mwfr_fastqc(fileset_accession, check_lanes, smaht_key):

file_set = get_file_set(fileset_accession, smaht_key)
mwf = get_latest_mwf(MWF_NAME_FASTQC, smaht_key)
if check_lanes:
print(f"Using MetaWorkflow {MWF_NAME_FASTQC}")
mwf = get_latest_mwf(MWF_NAME_FASTQC, smaht_key)
else:
print(f"Using MetaWorkflow {MWF_NAME_FASTQ_SHORT_READ}")
mwf = get_latest_mwf(MWF_NAME_FASTQ_SHORT_READ, smaht_key)
print(f"Using MetaWorkflow {mwf[ACCESSION]} ({mwf[ALIASES][0]})")

# Get unaligned R2 reads in the fileset that don't have already QC
Expand Down Expand Up @@ -245,14 +335,19 @@ def mwfr_fastqc(fileset_accession, smaht_key):

files_input_list = sorted(files_input, key=lambda x: x["dimension"])

# If we are not running the Illumina MWF, reformat to 1D list
if not check_lanes:
for i, inp in enumerate(files_input_list):
inp["dimension"] = f"{i}"

mwfr_input = [get_mwfr_file_input_arg(INPUT_FILES_FASTQ_GZ, files_input_list)]
create_and_post_mwfr(
mwf[UUID], file_set, INPUT_FILES_FASTQ_GZ, mwfr_input, smaht_key
)


def mwfr_ubam_qc_long_read(fileset_accession, smaht_key):

file_set = get_file_set(fileset_accession, smaht_key)
mwf = get_latest_mwf(MWF_NAME_FASTQ_LONG_READ, smaht_key)
print(f"Using MetaWorkflow {mwf[ACCESSION]} ({mwf[ALIASES][0]})")
Expand Down Expand Up @@ -391,9 +486,7 @@ def get_core_alignment_mwfr_input(file_set, file_input_arg, smaht_key):
sample = get_sample_from_library(library, smaht_key)

search_filter = (
"?type=UnalignedReads"
f"&file_sets.uuid={file_set[UUID]}"
f"&status={UPLOADED}"
"?type=UnalignedReads" f"&file_sets.uuid={file_set[UUID]}" f"&status={UPLOADED}"
)
search_result = ff_utils.search_metadata(f"/search/{search_filter}", key=smaht_key)
search_result.reverse()
Expand Down
23 changes: 23 additions & 0 deletions magma_smaht/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,29 @@ def get_sample_from_library(library, smaht_key):
return sample


def get_library_preparation_from_library(library, smaht_key):
"""Get the library preparation that is associated with a library

Args:
library (dict): library item from portal
smaht_key (dict): SMaHT key

Raises:
Exception: Raises an exception when there is no library preparation

Returns:
dict: library_preparation item from portal
"""
library_preparation = library.get("library_preparation")
if not library_preparation:
raise Exception(f"No library preparation found for library {library['accession']}")

library_preparation_item = ff_utils.get_metadata(
library_preparation, add_on="frame=raw&datastore=database", key=smaht_key
)
return library_preparation_item


def get_latest_mwf(mwf_name, smaht_key):
"""Get the latest version of the MWF with name `mwf_name`

Expand Down
5 changes: 3 additions & 2 deletions magma_smaht/wrangler_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@
JsonObject = Dict[str, Any]

WF_CRAM_TO_FASTQ_PAIRED_END = "cram_to_fastq_paired-end"
WF_BAM_TO_FASTQ_PAIRED_END = "bam_to_fastq_paired-end"

SUPPORTED_MWF = [MWF_NAME_CRAM_TO_FASTQ_PAIRED_END]
SUPPORTED_MWF = [MWF_NAME_CRAM_TO_FASTQ_PAIRED_END, WF_BAM_TO_FASTQ_PAIRED_END]

# Portal constants
COMPLETED = "completed"
Expand Down Expand Up @@ -48,7 +49,7 @@ def associate_conversion_output_with_fileset(

for wfr in mwfr_meta["workflow_runs"]:
output = wfr["output"]
if wfr["name"] != WF_CRAM_TO_FASTQ_PAIRED_END:
if wfr["name"] not in [WF_CRAM_TO_FASTQ_PAIRED_END, WF_BAM_TO_FASTQ_PAIRED_END]:
continue
if len(output) != 2:
print_error_and_exit(f"Expected exactly 2 output files in WorkflowRun")
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "magma-suite"
version = "3.6.2"
version = "3.7.0"
description = "Collection of tools to manage meta-workflows automation."
authors = ["Michele Berselli <berselli.michele@gmail.com>", "Doug Rioux", "Soo Lee", "CGAP team"]
license = "MIT"
Expand Down
Loading