Mercurial > repos > estrain > microrunqc
changeset 24:dc12d6ac296d draft
Fixed bug in MLST ouput
author | estrain |
---|---|
date | Fri, 02 Feb 2024 15:16:46 +0000 |
parents | 7272a9c36149 |
children | 4f457838f4c9 |
files | median_size.py microrunqc.xml microrunqc/median_size.py microrunqc/microrunqc.xml microrunqc/mlst.loc.sample microrunqc/mlstAddFields.py microrunqc/run_fastq_scan.py microrunqc/sum_mlst.py microrunqc/tool_data_table_conf.xml.sample mlst.loc.sample mlstAddFields.py run_fastq_scan.py sum_mlst.py tool_data_table_conf.xml.sample |
diffstat | 14 files changed, 450 insertions(+), 445 deletions(-) [+] |
line wrap: on
line diff
--- a/median_size.py Fri Jan 19 11:50:31 2024 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,26 +0,0 @@ -#!/usr/bin/env - -## Errol Strain (estrain@gmail.com) -## calculate median insert size from sam file - -import numpy as np - -def get_data(infile): - lengths = [] - for line in infile: - if line.startswith('@'): - pass - else: - line = line.rsplit() - length = int(line[8]) - if length > 0: - lengths.append(length) - else: - pass - return lengths - -if __name__ == "__main__": - import sys - lengths = get_data(sys.stdin) - md = int(np.median(lengths)) -print(md)
--- a/microrunqc.xml Fri Jan 19 11:50:31 2024 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,200 +0,0 @@ -<tool id="microrunqc" name="microrunqc" version="1.0.2"> - - <requirements> - <requirement type="package" version="2.4.0">skesa</requirement> - <requirement type="package" version="2.23.0">mlst</requirement> - <requirement type="package" version="0.7.17">bwa</requirement> - <requirement type="package" version="1.0.1">fastq-scan</requirement> - </requirements> - - <command detect_errors="exit_code"><![CDATA[ - - skesa - - #set fqscan = "text" - #if $jobtype.select == "fastq_fr" - #set outname = $jobtype.fastq1.name - #set bwalist = str($jobtype.fastq1) + " " + str($jobtype.fastq2) - --fastq $jobtype.fastq1,$jobtype.fastq2 - #if $jobtype.fastq1.is_of_type("fastq.gz") - #set fqscan = "gz" - #else if $jobtype.fastq1.is_of_type("fastqsanger.gz") - #set fqscan = "gz" - #end if - #else if $jobtype.select == "fastq_pair" - #set outname = $jobtype.coll.name - #set bwalist = str($jobtype.coll.forward) + " " + str($jobtype.coll.reverse) - --fastq $jobtype.coll.forward,$jobtype.coll.reverse - #if $jobtype.coll.forward.is_of_type("fastq.gz") - #set fqscan = "gz" - #else if $jobtype.coll.forward.is_of_type("fastqsanger.gz") - #set fqscan = "gz" - #end if - #end if - - #set num_cores = 1 - - #if $options.select =="basic" - --cores $num_cores - --memory 8 - #else if $options.select=="advanced" - #if $options.cores - #set num_cores = $options.cores - --cores $options.cores - #end if - #if $options.memory - --memory $options.memory - #end if - #if $options.hash_count - --hash_count - #end if - #if $options.estimated_kmers - --estimated_kmers $options.estimated.kmers - #end if - #if $options.skip - --skip_bloom_filter - #end if - #if $options.kmer - --kmer $options.kmer - #end if - #if $options.min_count - --min_count $options.min_count - #end if - #if $options.max_kmer_count - --max_kmer_count $options.max_kmer_count - #end if - #if $options.vector_percent - --vector_percent $options.vector_percent - #end if - #if $options.insert_size - --insert_size $options.insert.size - #end if - #if $options.steps - --steps $options.steps - #end if - #if $options.fraction - --fraction $options.fraction - #end if - #if $options.max_snp_len - --max_snp_len $options.max_snp_len - #end if - #if $options.min_contig - --min_contig $options.min_contig - #end if - #if $options.allow_snps - --allow_snps - #end if - #end if - - > ${outname}.fasta; - - bwa index ${outname}.fasta; - bwa mem -t $num_cores ${outname}.fasta ${bwalist} | python $__tool_directory__/median_size.py > insert.median; - - mlst --nopath --threads $num_cores --datadir $mlst_databases.fields.path/pubmlst --blastdb $mlst_databases.fields.path/blast/mlst.fa - #if $options.select=="advanced" - #if $options.minid - --minid $options.minid - #end if - #if $options.mincov - --mincov $options.mincov - #end if - #if $options.minscore - --minscore $options.minscore - #end if - #end if - ${outname}.fasta > ${outname}.mlst_raw.tsv; - - python $__tool_directory__/mlstAddFields.py ${outname}.mlst_raw.tsv $mlst_databases.fields.path/pubmlst > ${outname}.mlst.tsv; - - python $__tool_directory__/run_fastq_scan.py --fastq ${bwalist} --out fq_out.tab --type ${fqscan}; - - python $__tool_directory__/sum_mlst.py --fasta ${outname}.fasta --mlst ${outname}.mlst.tsv --med insert.median --fqscan fq_out.tab --out sum_qc.txt - - ]]></command> - <inputs> - <conditional name="jobtype"> - <param name="select" type="select" label="Select Input"> - <option value="fastq_fr">Forward and Reverse FASTQ</option> - <option value="fastq_pair">Paired FASTQ Collection</option> - </param> - <when value="fastq_fr"> - <param name="fastq1" type="data" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz" label="Forward FASTQ" /> - <param name="fastq2" type="data" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz" label="Reverse FASTQ" /> - </when> - <when value="fastq_pair"> - <param name="coll" label="Paired FASTQ" type="data_collection" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz" collection_type="paired" /> - </when> - </conditional> - - <conditional name="options"> - <param name="select" type="select" label="Options Type"> - <option value="basic">Basic</option> - <option value="advanced">Advanced</option> - </param> - <when value="advanced"> - <param name="cores" optional="true" type="integer" label="Number of cores to use (Default=16)" value=""/> - <param name="memory" optional="true" type="integer" label="Memory available (Default=32GB)" value=""/> - <param name="hash_count" optional="true" type="boolean" label="hash counter"/> - <param name="estimated_kmers" optional="true" type="integer" label="Estimated number of unique kmers for bloom filter (Default=100)" value=""/> - <param name="skip" optional="true" type="boolean" label="skip bloom filter, use estimate kmers as the hash"/> - <param name="kmer" optional="true" type="integer" label="Minimal kmer length for assembly (Default=21)" value=""/> - <param name="min_count" optional="true" type="integer" label="Minimal count for kmers retained for comparing alternate choices" value=""/> - <param name="max_kmer_count" optional="true" type="integer" label="Minimum acceptable average count for estimating the maximal kmer length in reads" value=""/> - <param name="vector_percent" optional="true" type="float" label="Count for vectors as a fraction of the read number (0-1,1=disabled)" value=""> - <validator type="in_range" message="Must be float(0,1)." min="0" max="1"/> - </param> - <param name="insert_size" optional="true" type="integer" label="Expected insert size for paired reads" value=""/> - <param name="steps" optional="true" type="integer" label="Number of assembly iterations from minimal to maximal kmer length in reads (Default=11)" value=""/> - <param name="fraction" optional="true" type="float" label="Maximum noise to signal ratio acceptable for extension (Default=0.1)" value=""> - <validator type="in_range" message="Must be float(0,1)." min="0" max="1"/> - </param> - <param name="max_snp_len" optional="true" type="integer" label="Maximal snp length (Default=150)" value=""/> - <param name="min_contig" optional="true" type="integer" label="Minimal contig length reported in output (Default=200)" value=""/> - <param name="allow_snps" optional="true" type="boolean" label="Turn SNP discovery (Default=false)"/> - <param name="mincov" type="integer" label="Minimum DNA %coverage" value="10" help="Minimum DNA %coverage to report partial allele at all (default 10, must be between 0-100)" optional="true" /> - <param name="minid" type="integer" label="Minimum DNA %identity" value="95" min="0" max="100" help="Minimum DNA %identity of full allelle to consider 'similar' (default 95, must be between 0-100)" optional="true" /> - <param name="minscore" type="integer" label="Minimum score to match scheme" value="50" min="0" max="100" help="Minumum score out of 100 to match a scheme" optional="true" /> - </when> - <when value="basic"/> - </conditional> - - <param name="mlst_databases" label="Select a mlst database" type="select"> - <options from_data_table="mlst"> - <validator message="No database is available" type="no_options" /> - </options> - </param> - - </inputs> - <outputs> - <data format="fasta" name="results.skesa.fasta" label="${tool.name} on ${on_string}: Contigs" from_work_dir="*.fasta"/> - <data format="tabular" name="results.mlst.tsv" label="${tool.name} on ${on_string}: MLST" from_work_dir="*.mlst.tsv"/> - <data format="tabular" name="qc_results.tsv" label="${tool.name} on ${on_string}: MLST" from_work_dir="*.txt"/> - </outputs> - - <help><![CDATA[ - - ]]></help> - <citations> - <citation type="bibtex"> - @misc{pope_dashnow_zobel_holt_raven_schultz_inouye_tomita_2014, - title={skesa: eSKESA is a de-novo sequence read assembler for cultured single isolate genomes - based on DeBruijn graphs. It uses conservative heuristics and is designed to - create breaks at repeat regions in the genome. This leads to excellent sequence - quality but not necessarily a large N50 statistic. It is a multi-threaded - application that scales well with the number of processors. For different runs - with the same inputs, including the order of reads, the order and orientation - of contigs in the output is deterministic. }, - url={https://github.com/ncbi/ngs-tools/tree/master/tools/skesa/}, - author={National Center for Biotechnology Information }, - }</citation> - - <citation type="bibtex"> - @UNPUBLISHED{Seemann2016, - author = "Seemann T", - title = "MLST: Scan contig files against PubMLST typing schemes", - year = "2016", - url = {https://github.com/tseemann/mlst} - }</citation> - </citations> -</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/microrunqc/median_size.py Fri Feb 02 15:16:46 2024 +0000 @@ -0,0 +1,26 @@ +#!/usr/bin/env + +## Errol Strain (estrain@gmail.com) +## calculate median insert size from sam file + +import numpy as np + +def get_data(infile): + lengths = [] + for line in infile: + if line.startswith('@'): + pass + else: + line = line.rsplit() + length = int(line[8]) + if length > 0: + lengths.append(length) + else: + pass + return lengths + +if __name__ == "__main__": + import sys + lengths = get_data(sys.stdin) + md = int(np.median(lengths)) +print(md)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/microrunqc/microrunqc.xml Fri Feb 02 15:16:46 2024 +0000 @@ -0,0 +1,200 @@ +<tool id="microrunqc" name="microrunqc" version="1.0.3"> + + <requirements> + <requirement type="package" version="2.4.0">skesa</requirement> + <requirement type="package" version="2.23.0">mlst</requirement> + <requirement type="package" version="0.7.17">bwa</requirement> + <requirement type="package" version="1.0.1">fastq-scan</requirement> + </requirements> + + <command detect_errors="exit_code"><![CDATA[ + + skesa + + #set fqscan = "text" + #if $jobtype.select == "fastq_fr" + #set outname = $jobtype.fastq1.name + #set bwalist = str($jobtype.fastq1) + " " + str($jobtype.fastq2) + --fastq $jobtype.fastq1,$jobtype.fastq2 + #if $jobtype.fastq1.is_of_type("fastq.gz") + #set fqscan = "gz" + #else if $jobtype.fastq1.is_of_type("fastqsanger.gz") + #set fqscan = "gz" + #end if + #else if $jobtype.select == "fastq_pair" + #set outname = $jobtype.coll.name + #set bwalist = str($jobtype.coll.forward) + " " + str($jobtype.coll.reverse) + --fastq $jobtype.coll.forward,$jobtype.coll.reverse + #if $jobtype.coll.forward.is_of_type("fastq.gz") + #set fqscan = "gz" + #else if $jobtype.coll.forward.is_of_type("fastqsanger.gz") + #set fqscan = "gz" + #end if + #end if + + #set num_cores = 1 + + #if $options.select =="basic" + --cores $num_cores + --memory 8 + #else if $options.select=="advanced" + #if $options.cores + #set num_cores = $options.cores + --cores $options.cores + #end if + #if $options.memory + --memory $options.memory + #end if + #if $options.hash_count + --hash_count + #end if + #if $options.estimated_kmers + --estimated_kmers $options.estimated.kmers + #end if + #if $options.skip + --skip_bloom_filter + #end if + #if $options.kmer + --kmer $options.kmer + #end if + #if $options.min_count + --min_count $options.min_count + #end if + #if $options.max_kmer_count + --max_kmer_count $options.max_kmer_count + #end if + #if $options.vector_percent + --vector_percent $options.vector_percent + #end if + #if $options.insert_size + --insert_size $options.insert.size + #end if + #if $options.steps + --steps $options.steps + #end if + #if $options.fraction + --fraction $options.fraction + #end if + #if $options.max_snp_len + --max_snp_len $options.max_snp_len + #end if + #if $options.min_contig + --min_contig $options.min_contig + #end if + #if $options.allow_snps + --allow_snps + #end if + #end if + + > ${outname}.fasta; + + bwa index ${outname}.fasta; + bwa mem -t $num_cores ${outname}.fasta ${bwalist} | python $__tool_directory__/median_size.py > insert.median; + + mlst --nopath --threads $num_cores --datadir $mlst_databases.fields.path/pubmlst --blastdb $mlst_databases.fields.path/blast/mlst.fa + #if $options.select=="advanced" + #if $options.minid + --minid $options.minid + #end if + #if $options.mincov + --mincov $options.mincov + #end if + #if $options.minscore + --minscore $options.minscore + #end if + #end if + ${outname}.fasta > ${outname}.mlst_raw.tsv; + + python $__tool_directory__/mlstAddFields.py ${outname}.mlst_raw.tsv $mlst_databases.fields.path/pubmlst > ${outname}.mlst.tsv; + + python $__tool_directory__/run_fastq_scan.py --fastq ${bwalist} --out fq_out.tab --type ${fqscan}; + + python $__tool_directory__/sum_mlst.py --fasta ${outname}.fasta --mlst ${outname}.mlst.tsv --med insert.median --fqscan fq_out.tab --out sum_qc.txt + + ]]></command> + <inputs> + <conditional name="jobtype"> + <param name="select" type="select" label="Select Input"> + <option value="fastq_fr">Forward and Reverse FASTQ</option> + <option value="fastq_pair">Paired FASTQ Collection</option> + </param> + <when value="fastq_fr"> + <param name="fastq1" type="data" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz" label="Forward FASTQ" /> + <param name="fastq2" type="data" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz" label="Reverse FASTQ" /> + </when> + <when value="fastq_pair"> + <param name="coll" label="Paired FASTQ" type="data_collection" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz" collection_type="paired" /> + </when> + </conditional> + + <conditional name="options"> + <param name="select" type="select" label="Options Type"> + <option value="basic">Basic</option> + <option value="advanced">Advanced</option> + </param> + <when value="advanced"> + <param name="cores" optional="true" type="integer" label="Number of cores to use (Default=16)" value=""/> + <param name="memory" optional="true" type="integer" label="Memory available (Default=32GB)" value=""/> + <param name="hash_count" optional="true" type="boolean" label="hash counter"/> + <param name="estimated_kmers" optional="true" type="integer" label="Estimated number of unique kmers for bloom filter (Default=100)" value=""/> + <param name="skip" optional="true" type="boolean" label="skip bloom filter, use estimate kmers as the hash"/> + <param name="kmer" optional="true" type="integer" label="Minimal kmer length for assembly (Default=21)" value=""/> + <param name="min_count" optional="true" type="integer" label="Minimal count for kmers retained for comparing alternate choices" value=""/> + <param name="max_kmer_count" optional="true" type="integer" label="Minimum acceptable average count for estimating the maximal kmer length in reads" value=""/> + <param name="vector_percent" optional="true" type="float" label="Count for vectors as a fraction of the read number (0-1,1=disabled)" value=""> + <validator type="in_range" message="Must be float(0,1)." min="0" max="1"/> + </param> + <param name="insert_size" optional="true" type="integer" label="Expected insert size for paired reads" value=""/> + <param name="steps" optional="true" type="integer" label="Number of assembly iterations from minimal to maximal kmer length in reads (Default=11)" value=""/> + <param name="fraction" optional="true" type="float" label="Maximum noise to signal ratio acceptable for extension (Default=0.1)" value=""> + <validator type="in_range" message="Must be float(0,1)." min="0" max="1"/> + </param> + <param name="max_snp_len" optional="true" type="integer" label="Maximal snp length (Default=150)" value=""/> + <param name="min_contig" optional="true" type="integer" label="Minimal contig length reported in output (Default=200)" value=""/> + <param name="allow_snps" optional="true" type="boolean" label="Turn SNP discovery (Default=false)"/> + <param name="mincov" type="integer" label="Minimum DNA %coverage" value="10" help="Minimum DNA %coverage to report partial allele at all (default 10, must be between 0-100)" optional="true" /> + <param name="minid" type="integer" label="Minimum DNA %identity" value="95" min="0" max="100" help="Minimum DNA %identity of full allelle to consider 'similar' (default 95, must be between 0-100)" optional="true" /> + <param name="minscore" type="integer" label="Minimum score to match scheme" value="50" min="0" max="100" help="Minumum score out of 100 to match a scheme" optional="true" /> + </when> + <when value="basic"/> + </conditional> + + <param name="mlst_databases" label="Select a mlst database" type="select"> + <options from_data_table="mlst"> + <validator message="No database is available" type="no_options" /> + </options> + </param> + + </inputs> + <outputs> + <data format="fasta" name="results.skesa.fasta" label="${tool.name} on ${on_string}: Contigs" from_work_dir="*.fasta"/> + <data format="tabular" name="results.mlst.tsv" label="${tool.name} on ${on_string}: MLST" from_work_dir="*.mlst.tsv"/> + <data format="tabular" name="qc_results.tsv" label="${tool.name} on ${on_string}: MLST" from_work_dir="*.txt"/> + </outputs> + + <help><![CDATA[ + + ]]></help> + <citations> + <citation type="bibtex"> + @misc{pope_dashnow_zobel_holt_raven_schultz_inouye_tomita_2014, + title={skesa: eSKESA is a de-novo sequence read assembler for cultured single isolate genomes + based on DeBruijn graphs. It uses conservative heuristics and is designed to + create breaks at repeat regions in the genome. This leads to excellent sequence + quality but not necessarily a large N50 statistic. It is a multi-threaded + application that scales well with the number of processors. For different runs + with the same inputs, including the order of reads, the order and orientation + of contigs in the output is deterministic. }, + url={https://github.com/ncbi/ngs-tools/tree/master/tools/skesa/}, + author={National Center for Biotechnology Information }, + }</citation> + + <citation type="bibtex"> + @UNPUBLISHED{Seemann2016, + author = "Seemann T", + title = "MLST: Scan contig files against PubMLST typing schemes", + year = "2016", + url = {https://github.com/tseemann/mlst} + }</citation> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/microrunqc/mlst.loc.sample Fri Feb 02 15:16:46 2024 +0000 @@ -0,0 +1,6 @@ +# this is a tab separated file describing the location of mlst databases +# +# the columns are: +# value name path +# +# for example
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/microrunqc/mlstAddFields.py Fri Feb 02 15:16:46 2024 +0000 @@ -0,0 +1,78 @@ +#!/usr/bin/env + +import sys +import csv + +def find_index(headers, term): + try: + return headers.index(term) + except ValueError: + return -1 + +def main(mlst_file, db_path=None): + with open(mlst_file, 'r') as file: + reader = csv.reader(file, delimiter='\t') + mlstout = next(reader) + + schema = mlstout[1] + mlstST = mlstout[2] + + # Return the output without appending if schema equals "-" + if schema == "-": + print("\t".join(mlstout)) + return + + if db_path is None: + # If no database path is provided, find it using an external command + # This requires the 'mlst' command to be installed and available in the path + import subprocess + mlstdesc = subprocess.check_output(['mlst', '-h']).decode() + db_pubmlst = [line for line in mlstdesc.split('\n') if 'db/pubmlst' in line] + if db_pubmlst: + mlstloc = db_pubmlst[0].split("'")[1].replace("bin/..", "") + else: + raise Exception("Could not find MLST database location.") + else: + mlstloc = db_path + + mlst_file_path = f"{mlstloc}/{schema}/{schema}.txt" + + schema_dict = {} + with open(mlst_file_path, 'r') as file: + reader = csv.reader(file, delimiter='\t') + headers = next(reader) + + clonal = find_index(headers, 'clonal_complex') + cc = find_index(headers, 'CC') + lineage = find_index(headers, 'Lineage') + species = find_index(headers, 'species') + + for line in reader: + desc = [] + if clonal > -1 and line[clonal]: + desc.append(f"clonal_complex={line[clonal]}") + if cc > -1 and line[cc]: + desc.append(f"CC={line[cc]}") + if lineage > -1 and line[lineage]: + desc.append(f"Lineage={line[lineage]}") + if species > -1 and line[species]: + desc.append(f"species={line[species]}") + schema_dict[line[0]] = ','.join(desc) + + output = mlstout[:3] + if mlstST in schema_dict: + output.append(schema_dict[mlstST]) + output.extend(mlstout[3:]) + + print("\t".join(output)) + +if __name__ == "__main__": + if len(sys.argv) < 2: + print("Usage: python mlstAddFields.py <mlst_file> [db_path]") + sys.exit(1) + + mlst_file = sys.argv[1] + db_path = sys.argv[2] if len(sys.argv) > 2 else None + + main(mlst_file, db_path) +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/microrunqc/run_fastq_scan.py Fri Feb 02 15:16:46 2024 +0000 @@ -0,0 +1,54 @@ +#!/usr/bin/env + +## Run fastq-scan to get mean read length and mean quality score +## author: errol strain, estrain@gmail.com + +from argparse import (ArgumentParser, FileType) +import sys +import glob +import subprocess +import json + +def parse_args(): + "Parse the input arguments, use '-h' for help." + + parser = ArgumentParser(description='Run fastq-scan on a pair of gzipped FASTQ files') + + # Read inputs + parser.add_argument('--fastq', type=str, required=True, nargs=2, help='FASTQ files') + parser.add_argument('--output', type=str, required=True, nargs=1, help='Output File') + parser.add_argument('--type', type=str, required=True, nargs=1, help='File Type (text or gz)') + + return parser.parse_args() + +args =parse_args() + +# FASTA file +r1 = args.fastq[0] +r2 = args.fastq[1] + +# Read 1 +if str(args.type[0]) == "gz" : + cmd1 = ["zcat", r1] +else : + cmd1 = ["cat", r1] +cmd2 = ["fastq-scan"] +pcmd1= subprocess.Popen(cmd1,stdout= subprocess.PIPE,shell=False) +r1json = json.loads(subprocess.Popen(cmd2, stdin=pcmd1.stdout,stdout=subprocess.PIPE,shell=False).communicate()[0]) +r1q = round(r1json["qc_stats"]["qual_mean"],1) +r1l = round(r1json["qc_stats"]["read_mean"],1) + +# Read 2 +if str(args.type[0]) == "gz" : + cmd1 = ["zcat", r2] +else : + cmd1 = ["cat", r2] +cmd2 = ["fastq-scan"] +pcmd1= subprocess.Popen(cmd1,stdout= subprocess.PIPE,shell=False) +r2json = json.loads(subprocess.Popen(cmd2, stdin=pcmd1.stdout,stdout=subprocess.PIPE,shell=False).communicate()[0]) +r2q = round(r2json["qc_stats"]["qual_mean"],1) +r2l = round(r2json["qc_stats"]["read_mean"],1) + +# Write output to be used by sum_mlst.py +output = open(args.output[0],"w") +output.write(str(r1l) + "\t" + str(r2l) + "\t" + str(r1q) + "\t" + str(r2q))
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/microrunqc/sum_mlst.py Fri Feb 02 15:16:46 2024 +0000 @@ -0,0 +1,79 @@ +#!/usr/bin/env + +## Generate basic summary stats from SKESA, fastq-scan, and MLST output. +## author: errol strain, estrain@gmail.com + +from argparse import (ArgumentParser, FileType) +import sys +import glob +import subprocess +from decimal import Decimal + +def parse_args(): + "Parse the input arguments, use '-h' for help." + + parser = ArgumentParser(description='Generate Basic Summary Statistics from SKESA assemblies, fastq-scan output, and MLST reports') + + # Read inputs + parser.add_argument('--fasta', type=str, required=True, nargs=1, help='SKESA FASTA assembly') + parser.add_argument('--mlst', type=str, required=True, nargs=1, help='MLST output') + parser.add_argument('--fqscan', type=str, required=True, nargs=1, help='fastq-scan output') + parser.add_argument('--med', type=str, required=True, nargs=1, help='Median Insert Size') + parser.add_argument('--output', type=str, required=True, nargs=1, help='Output File') + + return parser.parse_args() + +args =parse_args() + +# FASTA file +fasta = args.fasta[0] + +# Get individual and total length of contigs +cmd = ["awk", "/^>/ {if (seqlen){print seqlen}; ;seqlen=0;next; } { seqlen = seqlen +length($0)}END{print seqlen}",fasta] +seqlen = subprocess.Popen(cmd,stdout= subprocess.PIPE).communicate()[0] +intlen = list(map(int,seqlen.splitlines())) +totlen = sum(intlen) +# Count number of contigs +numtigs = len(intlen) + +# Get coverage information from skesa fasta header +cmd1 = ["grep",">",fasta] +cmd2 = ["cut","-f","3","-d","_"] +p1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE) +p2 = subprocess.Popen(cmd2, stdin=p1.stdout, stdout=subprocess.PIPE).communicate()[0] +covdep = map(float,p2.splitlines()) +covlist = [a*b for a,b in zip([float(i) for i in intlen],covdep)] +covdep = round(sum(covlist)/totlen,1) + +# Calculate N50 +vals = [int(i) for i in intlen] +vals.sort(reverse=True) +n50=0 +for counter in range(0,len(vals)-1): + if sum(vals[0:counter]) > (totlen/2): + n50=vals[counter-1] + break + +# Read in MLST output +mlst = open(args.mlst[0],"r") +profile = mlst.readline() +els = profile.split("\t") + +# Read in median insert size +medfile = open(args.med[0],"r") +insert = medfile.readline() +insert = insert.rstrip() + +# Read in fastq-scan +fqfile = open(args.fqscan[0],"r") +fq = fqfile.readline() +fq = fq.rstrip() + +output = open(args.output[0],"w") + +filehead = str("File\tContigs\tLength\tEstCov\tN50\tMedianInsert\tMeanLength_R1\tMeanLength_R2\tMeanQ_R1\tMeanQ_R2\tScheme\tST\n") +output.write(filehead) + +output.write(str(fasta) + "\t" + str(numtigs) + "\t" + str(totlen) + "\t" + str(covdep) + "\t" + str(n50) +"\t" + str(insert) + "\t" + str(fq)) +for counter in range(1,len(els)): + output.write("\t" + str(els[counter]))
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/microrunqc/tool_data_table_conf.xml.sample Fri Feb 02 15:16:46 2024 +0000 @@ -0,0 +1,7 @@ +<tables> + <!-- Locations of amrfinderplus database in the required format --> + <table name="mlst" comment_char="#" allow_duplicate_entries="False"> + <columns>value, name, path</columns> + <file path="tool-data/mlst.loc" /> + </table> +</tables>
--- a/mlst.loc.sample Fri Jan 19 11:50:31 2024 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -# this is a tab separated file describing the location of mlst databases -# -# the columns are: -# value name path -# -# for example
--- a/mlstAddFields.py Fri Jan 19 11:50:31 2024 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,73 +0,0 @@ -#!/usr/bin/env - -import sys -import csv - -def find_index(headers, term): - try: - return headers.index(term) - except ValueError: - return -1 - -def main(mlst_file, db_path=None): - with open(mlst_file, 'r') as file: - reader = csv.reader(file, delimiter='\t') - mlstout = next(reader) - - schema = mlstout[1] - mlstST = mlstout[2] - - if db_path is None: - # If no database path is provided, find it using an external command - # This requires the 'mlst' command to be installed and available in the path - import subprocess - mlstdesc = subprocess.check_output(['mlst', '-h']).decode() - db_pubmlst = [line for line in mlstdesc.split('\n') if 'db/pubmlst' in line] - if db_pubmlst: - mlstloc = db_pubmlst[0].split("'")[1].replace("bin/..", "") - else: - raise Exception("Could not find MLST database location.") - else: - mlstloc = db_path - - mlst_file_path = f"{mlstloc}/{schema}/{schema}.txt" - - schema_dict = {} - with open(mlst_file_path, 'r') as file: - reader = csv.reader(file, delimiter='\t') - headers = next(reader) - - clonal = find_index(headers, 'clonal_complex') - cc = find_index(headers, 'CC') - lineage = find_index(headers, 'Lineage') - species = find_index(headers, 'species') - - for line in reader: - desc = [] - if clonal > -1 and line[clonal]: - desc.append(f"clonal_complex={line[clonal]}") - if cc > -1 and line[cc]: - desc.append(f"CC={line[cc]}") - if lineage > -1 and line[lineage]: - desc.append(f"Lineage={line[lineage]}") - if species > -1 and line[species]: - desc.append(f"species={line[species]}") - schema_dict[line[0]] = ','.join(desc) - - output = mlstout[:3] - if mlstST in schema_dict: - output.append(schema_dict[mlstST]) - output.extend(mlstout[3:]) - - print("\t".join(output)) - -if __name__ == "__main__": - if len(sys.argv) < 2: - print("Usage: python mlstAddFields.py <mlst_file> [db_path]") - sys.exit(1) - - mlst_file = sys.argv[1] - db_path = sys.argv[2] if len(sys.argv) > 2 else None - - main(mlst_file, db_path) -
--- a/run_fastq_scan.py Fri Jan 19 11:50:31 2024 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,54 +0,0 @@ -#!/usr/bin/env - -## Run fastq-scan to get mean read length and mean quality score -## author: errol strain, estrain@gmail.com - -from argparse import (ArgumentParser, FileType) -import sys -import glob -import subprocess -import json - -def parse_args(): - "Parse the input arguments, use '-h' for help." - - parser = ArgumentParser(description='Run fastq-scan on a pair of gzipped FASTQ files') - - # Read inputs - parser.add_argument('--fastq', type=str, required=True, nargs=2, help='FASTQ files') - parser.add_argument('--output', type=str, required=True, nargs=1, help='Output File') - parser.add_argument('--type', type=str, required=True, nargs=1, help='File Type (text or gz)') - - return parser.parse_args() - -args =parse_args() - -# FASTA file -r1 = args.fastq[0] -r2 = args.fastq[1] - -# Read 1 -if str(args.type[0]) == "gz" : - cmd1 = ["zcat", r1] -else : - cmd1 = ["cat", r1] -cmd2 = ["fastq-scan"] -pcmd1= subprocess.Popen(cmd1,stdout= subprocess.PIPE,shell=False) -r1json = json.loads(subprocess.Popen(cmd2, stdin=pcmd1.stdout,stdout=subprocess.PIPE,shell=False).communicate()[0]) -r1q = round(r1json["qc_stats"]["qual_mean"],1) -r1l = round(r1json["qc_stats"]["read_mean"],1) - -# Read 2 -if str(args.type[0]) == "gz" : - cmd1 = ["zcat", r2] -else : - cmd1 = ["cat", r2] -cmd2 = ["fastq-scan"] -pcmd1= subprocess.Popen(cmd1,stdout= subprocess.PIPE,shell=False) -r2json = json.loads(subprocess.Popen(cmd2, stdin=pcmd1.stdout,stdout=subprocess.PIPE,shell=False).communicate()[0]) -r2q = round(r2json["qc_stats"]["qual_mean"],1) -r2l = round(r2json["qc_stats"]["read_mean"],1) - -# Write output to be used by sum_mlst.py -output = open(args.output[0],"w") -output.write(str(r1l) + "\t" + str(r2l) + "\t" + str(r1q) + "\t" + str(r2q))
--- a/sum_mlst.py Fri Jan 19 11:50:31 2024 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,79 +0,0 @@ -#!/usr/bin/env - -## Generate basic summary stats from SKESA, fastq-scan, and MLST output. -## author: errol strain, estrain@gmail.com - -from argparse import (ArgumentParser, FileType) -import sys -import glob -import subprocess -from decimal import Decimal - -def parse_args(): - "Parse the input arguments, use '-h' for help." - - parser = ArgumentParser(description='Generate Basic Summary Statistics from SKESA assemblies, fastq-scan output, and MLST reports') - - # Read inputs - parser.add_argument('--fasta', type=str, required=True, nargs=1, help='SKESA FASTA assembly') - parser.add_argument('--mlst', type=str, required=True, nargs=1, help='MLST output') - parser.add_argument('--fqscan', type=str, required=True, nargs=1, help='fastq-scan output') - parser.add_argument('--med', type=str, required=True, nargs=1, help='Median Insert Size') - parser.add_argument('--output', type=str, required=True, nargs=1, help='Output File') - - return parser.parse_args() - -args =parse_args() - -# FASTA file -fasta = args.fasta[0] - -# Get individual and total length of contigs -cmd = ["awk", "/^>/ {if (seqlen){print seqlen}; ;seqlen=0;next; } { seqlen = seqlen +length($0)}END{print seqlen}",fasta] -seqlen = subprocess.Popen(cmd,stdout= subprocess.PIPE).communicate()[0] -intlen = list(map(int,seqlen.splitlines())) -totlen = sum(intlen) -# Count number of contigs -numtigs = len(intlen) - -# Get coverage information from skesa fasta header -cmd1 = ["grep",">",fasta] -cmd2 = ["cut","-f","3","-d","_"] -p1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE) -p2 = subprocess.Popen(cmd2, stdin=p1.stdout, stdout=subprocess.PIPE).communicate()[0] -covdep = map(float,p2.splitlines()) -covlist = [a*b for a,b in zip([float(i) for i in intlen],covdep)] -covdep = round(sum(covlist)/totlen,1) - -# Calculate N50 -vals = [int(i) for i in intlen] -vals.sort(reverse=True) -n50=0 -for counter in range(0,len(vals)-1): - if sum(vals[0:counter]) > (totlen/2): - n50=vals[counter-1] - break - -# Read in MLST output -mlst = open(args.mlst[0],"r") -profile = mlst.readline() -els = profile.split("\t") - -# Read in median insert size -medfile = open(args.med[0],"r") -insert = medfile.readline() -insert = insert.rstrip() - -# Read in fastq-scan -fqfile = open(args.fqscan[0],"r") -fq = fqfile.readline() -fq = fq.rstrip() - -output = open(args.output[0],"w") - -filehead = str("File\tContigs\tLength\tEstCov\tN50\tMedianInsert\tMeanLength_R1\tMeanLength_R2\tMeanQ_R1\tMeanQ_R2\tScheme\tST\n") -output.write(filehead) - -output.write(str(fasta) + "\t" + str(numtigs) + "\t" + str(totlen) + "\t" + str(covdep) + "\t" + str(n50) +"\t" + str(insert) + "\t" + str(fq)) -for counter in range(1,len(els)): - output.write("\t" + str(els[counter]))
--- a/tool_data_table_conf.xml.sample Fri Jan 19 11:50:31 2024 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ -<tables> - <!-- Locations of amrfinderplus database in the required format --> - <table name="mlst" comment_char="#" allow_duplicate_entries="False"> - <columns>value, name, path</columns> - <file path="tool-data/mlst.loc" /> - </table> -</tables>