changeset 24:dc12d6ac296d draft

Fixed bug in MLST output
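
The fix is the early return added in microrunqc/mlstAddFields.py below: when mlst reports no matching scheme (the scheme column is "-"), the raw mlst line is now printed unchanged instead of building a path to a scheme profile file that does not exist. A minimal sketch of the new guard (the input row here is hypothetical, for illustration only):

    # sketch of the guard added in mlstAddFields.py; the input row is hypothetical
    mlstout = ["sample.fasta", "-", "-"]   # mlst found no scheme for this assembly
    schema = mlstout[1]
    if schema == "-":
        print("\t".join(mlstout))          # emit the raw mlst line and stop
    else:
        pass                               # otherwise look up clonal_complex/CC/Lineage/species and append them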
author estrain
date Fri, 02 Feb 2024 15:16:46 +0000
parents 7272a9c36149
children 4f457838f4c9
files median_size.py microrunqc.xml microrunqc/median_size.py microrunqc/microrunqc.xml microrunqc/mlst.loc.sample microrunqc/mlstAddFields.py microrunqc/run_fastq_scan.py microrunqc/sum_mlst.py microrunqc/tool_data_table_conf.xml.sample mlst.loc.sample mlstAddFields.py run_fastq_scan.py sum_mlst.py tool_data_table_conf.xml.sample
diffstat 14 files changed, 450 insertions(+), 445 deletions(-)
line diff
--- a/median_size.py	Fri Jan 19 11:50:31 2024 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,26 +0,0 @@
-#!/usr/bin/env
-
-## Errol Strain (estrain@gmail.com)
-## calculate median insert size from sam file
-
-import numpy as np
-
-def get_data(infile):
-    lengths = []
-    for line in infile:
-        if line.startswith('@'):
-            pass
-        else:
-            line = line.rsplit()
-            length = int(line[8])
-            if length > 0:
-                lengths.append(length)
-            else:
-                pass
-    return lengths
-
-if __name__ == "__main__":
-    import sys
-    lengths = get_data(sys.stdin)
-    md = int(np.median(lengths))
-print(md)
--- a/microrunqc.xml	Fri Jan 19 11:50:31 2024 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,200 +0,0 @@
-<tool id="microrunqc" name="microrunqc" version="1.0.2">
-
-  <requirements>
-    <requirement type="package" version="2.4.0">skesa</requirement>
-    <requirement type="package" version="2.23.0">mlst</requirement>
-    <requirement type="package" version="0.7.17">bwa</requirement>
-    <requirement type="package" version="1.0.1">fastq-scan</requirement>
-  </requirements>
-
-  <command detect_errors="exit_code"><![CDATA[
-
-    skesa
-
-    #set fqscan = "text"
-    #if $jobtype.select == "fastq_fr"
-      #set outname = $jobtype.fastq1.name
-      #set bwalist = str($jobtype.fastq1) + " " + str($jobtype.fastq2)
-      --fastq $jobtype.fastq1,$jobtype.fastq2
-      #if $jobtype.fastq1.is_of_type("fastq.gz")
-        #set fqscan = "gz"
-      #else if $jobtype.fastq1.is_of_type("fastqsanger.gz")
-        #set fqscan = "gz"
-      #end if
-    #else if $jobtype.select == "fastq_pair"
-      #set outname = $jobtype.coll.name
-      #set bwalist = str($jobtype.coll.forward) + " " + str($jobtype.coll.reverse)
-      --fastq $jobtype.coll.forward,$jobtype.coll.reverse
-      #if $jobtype.coll.forward.is_of_type("fastq.gz")
-        #set fqscan = "gz"
-      #else if $jobtype.coll.forward.is_of_type("fastqsanger.gz")
-        #set fqscan = "gz"
-      #end if
-    #end if
-
-    #set num_cores = 1
-
-    #if $options.select =="basic"
-      --cores $num_cores 
-      --memory 8 
-    #else if $options.select=="advanced"
-      #if $options.cores
-        #set num_cores = $options.cores
-        --cores $options.cores
-      #end if
-      #if $options.memory
-        --memory $options.memory
-      #end if
-      #if $options.hash_count
-        --hash_count
-      #end if
-      #if $options.estimated_kmers
-        --estimated_kmers $options.estimated.kmers
-      #end if
-      #if $options.skip
-       --skip_bloom_filter
-      #end if
-      #if $options.kmer
-       --kmer $options.kmer 
-      #end if
-      #if $options.min_count
-       --min_count $options.min_count 
-      #end if
-      #if $options.max_kmer_count
-       --max_kmer_count $options.max_kmer_count 
-      #end if
-      #if $options.vector_percent
-       --vector_percent $options.vector_percent 
-      #end if
-      #if $options.insert_size
-       --insert_size $options.insert.size 
-      #end if
-      #if $options.steps
-       --steps $options.steps 
-      #end if
-      #if $options.fraction
-       --fraction $options.fraction 
-      #end if
-      #if $options.max_snp_len
-       --max_snp_len $options.max_snp_len 
-      #end if
-      #if $options.min_contig
-       --min_contig $options.min_contig 
-      #end if
-      #if $options.allow_snps
-       --allow_snps 
-      #end if
-    #end if
-
-    > ${outname}.fasta; 
-
-    bwa index ${outname}.fasta;
-    bwa mem -t $num_cores  ${outname}.fasta ${bwalist} | python $__tool_directory__/median_size.py > insert.median;
- 
-    mlst --nopath --threads $num_cores --datadir $mlst_databases.fields.path/pubmlst --blastdb $mlst_databases.fields.path/blast/mlst.fa  
-    #if $options.select=="advanced"
-      #if $options.minid
-        --minid $options.minid
-      #end if
-      #if $options.mincov
-        --mincov $options.mincov
-      #end if
-      #if $options.minscore
-        --minscore $options.minscore
-      #end if
-    #end if
-    ${outname}.fasta > ${outname}.mlst_raw.tsv;
-    
-    python $__tool_directory__/mlstAddFields.py ${outname}.mlst_raw.tsv $mlst_databases.fields.path/pubmlst > ${outname}.mlst.tsv;
-
-    python $__tool_directory__/run_fastq_scan.py --fastq ${bwalist} --out fq_out.tab --type ${fqscan};
-
-    python $__tool_directory__/sum_mlst.py --fasta ${outname}.fasta --mlst ${outname}.mlst.tsv --med insert.median --fqscan fq_out.tab --out sum_qc.txt 
-
-  ]]></command>
-    <inputs>
-      <conditional name="jobtype">
-        <param name="select" type="select" label="Select Input">
-          <option value="fastq_fr">Forward and Reverse FASTQ</option>
-          <option value="fastq_pair">Paired FASTQ Collection</option>
-        </param>
-        <when value="fastq_fr">
-          <param name="fastq1" type="data" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz" label="Forward FASTQ" />
-          <param name="fastq2" type="data" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz" label="Reverse FASTQ" />
-        </when>
-        <when value="fastq_pair">
-          <param name="coll" label="Paired FASTQ" type="data_collection" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz" collection_type="paired" />
-        </when>
-      </conditional>
-  
-      <conditional name="options">
-        <param name="select" type="select" label="Options Type">
-          <option value="basic">Basic</option>
-          <option value="advanced">Advanced</option>
-        </param>
-        <when value="advanced">
-          <param name="cores" optional="true" type="integer" label="Number of cores to use (Default=16)" value=""/>
-          <param name="memory" optional="true" type="integer" label="Memory available (Default=32GB)" value=""/>
-          <param name="hash_count" optional="true" type="boolean" label="hash counter"/>
-          <param name="estimated_kmers" optional="true" type="integer" label="Estimated number of unique kmers for bloom filter (Default=100)" value=""/>
-          <param name="skip" optional="true" type="boolean" label="skip bloom filter, use estimate kmers as the hash"/>
-          <param name="kmer" optional="true" type="integer" label="Minimal kmer length for assembly (Default=21)" value=""/>
-          <param name="min_count" optional="true" type="integer" label="Minimal count for kmers retained for comparing alternate choices" value=""/>
-          <param name="max_kmer_count" optional="true" type="integer" label="Minimum acceptable average count for estimating the maximal kmer length in reads" value=""/>
-          <param name="vector_percent" optional="true" type="float" label="Count for vectors as a fraction of the read number (0-1,1=disabled)"  value="">
-            <validator type="in_range" message="Must be float(0,1)." min="0" max="1"/>
-          </param>
-          <param name="insert_size" optional="true" type="integer" label="Expected insert size for paired reads" value=""/>
-          <param name="steps" optional="true" type="integer" label="Number of assembly iterations from minimal to maximal kmer length in reads (Default=11)" value=""/>
-          <param name="fraction" optional="true" type="float" label="Maximum noise to signal ratio acceptable for extension (Default=0.1)" value="">
-            <validator type="in_range" message="Must be float(0,1)." min="0" max="1"/>
-          </param>
-          <param name="max_snp_len" optional="true" type="integer" label="Maximal snp length (Default=150)" value=""/>
-          <param name="min_contig" optional="true" type="integer" label="Minimal contig length reported in output (Default=200)" value=""/>
-          <param name="allow_snps" optional="true" type="boolean" label="Turn SNP discovery (Default=false)"/>
-          <param name="mincov" type="integer" label="Minimum DNA %coverage" value="10" help="Minimum DNA %coverage to report partial allele at all (default 10, must be between 0-100)" optional="true" />
-          <param name="minid" type="integer" label="Minimum DNA %identity" value="95" min="0" max="100" help="Minimum DNA %identity of full allelle to consider 'similar' (default 95, must be between 0-100)" optional="true" />
-          <param name="minscore" type="integer" label="Minimum score to match scheme" value="50" min="0" max="100" help="Minumum score out of 100 to match a scheme" optional="true" />
-        </when>
-        <when value="basic"/>
-      </conditional>
-
-      <param name="mlst_databases" label="Select a mlst database" type="select">
-          <options from_data_table="mlst">
-              <validator message="No database is available" type="no_options" />
-          </options>
-      </param>
-
-    </inputs>
-    <outputs>
-      <data format="fasta" name="results.skesa.fasta" label="${tool.name} on ${on_string}: Contigs" from_work_dir="*.fasta"/>
-      <data format="tabular" name="results.mlst.tsv" label="${tool.name} on ${on_string}: MLST" from_work_dir="*.mlst.tsv"/>
-      <data format="tabular" name="qc_results.tsv" label="${tool.name} on ${on_string}: MLST" from_work_dir="*.txt"/>
-    </outputs>
-
-    <help><![CDATA[
-
-    ]]></help>
-     <citations>
-        <citation type="bibtex">
-        @misc{pope_dashnow_zobel_holt_raven_schultz_inouye_tomita_2014,
-        title={skesa: eSKESA is a de-novo sequence read assembler for cultured single isolate genomes
-    based on DeBruijn graphs. It uses conservative heuristics and is designed to
-    create breaks at repeat regions in the genome. This leads to excellent sequence
-    quality but not necessarily a large N50 statistic. It is a multi-threaded
-    application that scales well with the number of processors. For different runs
-    with the same inputs, including the order of reads, the order and orientation
-    of contigs in the output is deterministic. },
-        url={https://github.com/ncbi/ngs-tools/tree/master/tools/skesa/},
-        author={National Center for Biotechnology Information },
-       }</citation>
-
-       <citation type="bibtex">
-       @UNPUBLISHED{Seemann2016,
-       author = "Seemann T",
-       title = "MLST: Scan contig files against PubMLST typing schemes",
-       year = "2016",
-       url = {https://github.com/tseemann/mlst}
-      }</citation>
-    </citations>
-</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/microrunqc/median_size.py	Fri Feb 02 15:16:46 2024 +0000
@@ -0,0 +1,26 @@
+#!/usr/bin/env python
+
+## Errol Strain (estrain@gmail.com)
+## calculate median insert size from sam file
+
+import numpy as np
+
+def get_data(infile):
+    lengths = []
+    for line in infile:
+        if line.startswith('@'):
+            pass
+        else:
+            line = line.rsplit()
+            length = int(line[8])
+            if length > 0:
+                lengths.append(length)
+            else:
+                pass
+    return lengths
+
+if __name__ == "__main__":
+    import sys
+    lengths = get_data(sys.stdin)
+    md = int(np.median(lengths))
+    print(md)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/microrunqc/microrunqc.xml	Fri Feb 02 15:16:46 2024 +0000
@@ -0,0 +1,200 @@
+<tool id="microrunqc" name="microrunqc" version="1.0.3">
+
+  <requirements>
+    <requirement type="package" version="2.4.0">skesa</requirement>
+    <requirement type="package" version="2.23.0">mlst</requirement>
+    <requirement type="package" version="0.7.17">bwa</requirement>
+    <requirement type="package" version="1.0.1">fastq-scan</requirement>
+  </requirements>
+
+  <command detect_errors="exit_code"><![CDATA[
+
+    skesa
+
+    #set fqscan = "text"
+    #if $jobtype.select == "fastq_fr"
+      #set outname = $jobtype.fastq1.name
+      #set bwalist = str($jobtype.fastq1) + " " + str($jobtype.fastq2)
+      --fastq $jobtype.fastq1,$jobtype.fastq2
+      #if $jobtype.fastq1.is_of_type("fastq.gz")
+        #set fqscan = "gz"
+      #else if $jobtype.fastq1.is_of_type("fastqsanger.gz")
+        #set fqscan = "gz"
+      #end if
+    #else if $jobtype.select == "fastq_pair"
+      #set outname = $jobtype.coll.name
+      #set bwalist = str($jobtype.coll.forward) + " " + str($jobtype.coll.reverse)
+      --fastq $jobtype.coll.forward,$jobtype.coll.reverse
+      #if $jobtype.coll.forward.is_of_type("fastq.gz")
+        #set fqscan = "gz"
+      #else if $jobtype.coll.forward.is_of_type("fastqsanger.gz")
+        #set fqscan = "gz"
+      #end if
+    #end if
+
+    #set num_cores = 1
+
+    #if $options.select =="basic"
+      --cores $num_cores 
+      --memory 8 
+    #else if $options.select=="advanced"
+      #if $options.cores
+        #set num_cores = $options.cores
+        --cores $options.cores
+      #end if
+      #if $options.memory
+        --memory $options.memory
+      #end if
+      #if $options.hash_count
+        --hash_count
+      #end if
+      #if $options.estimated_kmers
+        --estimated_kmers $options.estimated_kmers
+      #end if
+      #if $options.skip
+       --skip_bloom_filter
+      #end if
+      #if $options.kmer
+       --kmer $options.kmer 
+      #end if
+      #if $options.min_count
+       --min_count $options.min_count 
+      #end if
+      #if $options.max_kmer_count
+       --max_kmer_count $options.max_kmer_count 
+      #end if
+      #if $options.vector_percent
+       --vector_percent $options.vector_percent 
+      #end if
+      #if $options.insert_size
+       --insert_size $options.insert_size 
+      #end if
+      #if $options.steps
+       --steps $options.steps 
+      #end if
+      #if $options.fraction
+       --fraction $options.fraction 
+      #end if
+      #if $options.max_snp_len
+       --max_snp_len $options.max_snp_len 
+      #end if
+      #if $options.min_contig
+       --min_contig $options.min_contig 
+      #end if
+      #if $options.allow_snps
+       --allow_snps 
+      #end if
+    #end if
+
+    > ${outname}.fasta; 
+
+    bwa index ${outname}.fasta;
+    bwa mem -t $num_cores  ${outname}.fasta ${bwalist} | python $__tool_directory__/median_size.py > insert.median;
+ 
+    mlst --nopath --threads $num_cores --datadir $mlst_databases.fields.path/pubmlst --blastdb $mlst_databases.fields.path/blast/mlst.fa  
+    #if $options.select=="advanced"
+      #if $options.minid
+        --minid $options.minid
+      #end if
+      #if $options.mincov
+        --mincov $options.mincov
+      #end if
+      #if $options.minscore
+        --minscore $options.minscore
+      #end if
+    #end if
+    ${outname}.fasta > ${outname}.mlst_raw.tsv;
+    
+    python $__tool_directory__/mlstAddFields.py ${outname}.mlst_raw.tsv $mlst_databases.fields.path/pubmlst > ${outname}.mlst.tsv;
+
+    python $__tool_directory__/run_fastq_scan.py --fastq ${bwalist} --out fq_out.tab --type ${fqscan};
+
+    python $__tool_directory__/sum_mlst.py --fasta ${outname}.fasta --mlst ${outname}.mlst.tsv --med insert.median --fqscan fq_out.tab --out sum_qc.txt 
+
+  ]]></command>
+    <inputs>
+      <conditional name="jobtype">
+        <param name="select" type="select" label="Select Input">
+          <option value="fastq_fr">Forward and Reverse FASTQ</option>
+          <option value="fastq_pair">Paired FASTQ Collection</option>
+        </param>
+        <when value="fastq_fr">
+          <param name="fastq1" type="data" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz" label="Forward FASTQ" />
+          <param name="fastq2" type="data" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz" label="Reverse FASTQ" />
+        </when>
+        <when value="fastq_pair">
+          <param name="coll" label="Paired FASTQ" type="data_collection" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz" collection_type="paired" />
+        </when>
+      </conditional>
+  
+      <conditional name="options">
+        <param name="select" type="select" label="Options Type">
+          <option value="basic">Basic</option>
+          <option value="advanced">Advanced</option>
+        </param>
+        <when value="advanced">
+          <param name="cores" optional="true" type="integer" label="Number of cores to use (Default=16)" value=""/>
+          <param name="memory" optional="true" type="integer" label="Memory available (Default=32GB)" value=""/>
+          <param name="hash_count" optional="true" type="boolean" label="hash counter"/>
+          <param name="estimated_kmers" optional="true" type="integer" label="Estimated number of unique kmers for bloom filter (Default=100)" value=""/>
+          <param name="skip" optional="true" type="boolean" label="skip bloom filter, use estimate kmers as the hash"/>
+          <param name="kmer" optional="true" type="integer" label="Minimal kmer length for assembly (Default=21)" value=""/>
+          <param name="min_count" optional="true" type="integer" label="Minimal count for kmers retained for comparing alternate choices" value=""/>
+          <param name="max_kmer_count" optional="true" type="integer" label="Minimum acceptable average count for estimating the maximal kmer length in reads" value=""/>
+          <param name="vector_percent" optional="true" type="float" label="Count for vectors as a fraction of the read number (0-1,1=disabled)"  value="">
+            <validator type="in_range" message="Must be float(0,1)." min="0" max="1"/>
+          </param>
+          <param name="insert_size" optional="true" type="integer" label="Expected insert size for paired reads" value=""/>
+          <param name="steps" optional="true" type="integer" label="Number of assembly iterations from minimal to maximal kmer length in reads (Default=11)" value=""/>
+          <param name="fraction" optional="true" type="float" label="Maximum noise to signal ratio acceptable for extension (Default=0.1)" value="">
+            <validator type="in_range" message="Must be float(0,1)." min="0" max="1"/>
+          </param>
+          <param name="max_snp_len" optional="true" type="integer" label="Maximal snp length (Default=150)" value=""/>
+          <param name="min_contig" optional="true" type="integer" label="Minimal contig length reported in output (Default=200)" value=""/>
+          <param name="allow_snps" optional="true" type="boolean" label="Turn SNP discovery (Default=false)"/>
+          <param name="mincov" type="integer" label="Minimum DNA %coverage" value="10" help="Minimum DNA %coverage to report partial allele at all (default 10, must be between 0-100)" optional="true" />
+          <param name="minid" type="integer" label="Minimum DNA %identity" value="95" min="0" max="100" help="Minimum DNA %identity of full allelle to consider 'similar' (default 95, must be between 0-100)" optional="true" />
+          <param name="minscore" type="integer" label="Minimum score to match scheme" value="50" min="0" max="100" help="Minumum score out of 100 to match a scheme" optional="true" />
+        </when>
+        <when value="basic"/>
+      </conditional>
+
+      <param name="mlst_databases" label="Select a mlst database" type="select">
+          <options from_data_table="mlst">
+              <validator message="No database is available" type="no_options" />
+          </options>
+      </param>
+
+    </inputs>
+    <outputs>
+      <data format="fasta" name="results.skesa.fasta" label="${tool.name} on ${on_string}: Contigs" from_work_dir="*.fasta"/>
+      <data format="tabular" name="results.mlst.tsv" label="${tool.name} on ${on_string}: MLST" from_work_dir="*.mlst.tsv"/>
+      <data format="tabular" name="qc_results.tsv" label="${tool.name} on ${on_string}: MLST" from_work_dir="*.txt"/>
+    </outputs>
+
+    <help><![CDATA[
+
+    ]]></help>
+     <citations>
+        <citation type="bibtex">
+        @misc{pope_dashnow_zobel_holt_raven_schultz_inouye_tomita_2014,
+        title={SKESA: SKESA is a de-novo sequence read assembler for cultured single isolate genomes
+    based on DeBruijn graphs. It uses conservative heuristics and is designed to
+    create breaks at repeat regions in the genome. This leads to excellent sequence
+    quality but not necessarily a large N50 statistic. It is a multi-threaded
+    application that scales well with the number of processors. For different runs
+    with the same inputs, including the order of reads, the order and orientation
+    of contigs in the output is deterministic. },
+        url={https://github.com/ncbi/ngs-tools/tree/master/tools/skesa/},
+        author={National Center for Biotechnology Information },
+       }</citation>
+
+       <citation type="bibtex">
+       @UNPUBLISHED{Seemann2016,
+       author = "Seemann T",
+       title = "MLST: Scan contig files against PubMLST typing schemes",
+       year = "2016",
+       url = {https://github.com/tseemann/mlst}
+      }</citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/microrunqc/mlst.loc.sample	Fri Feb 02 15:16:46 2024 +0000
@@ -0,0 +1,6 @@
+# this is a tab separated file describing the location of mlst databases
+#
+# the columns are:
+# value name path
+#
+# for example
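+# (hypothetical illustration only -- substitute the value, display name, and path of the locally installed database)
+#mlst_2024-01-01	MLST_2024-01-01	/depot/tool-data/mlst/2024-01-01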
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/microrunqc/mlstAddFields.py	Fri Feb 02 15:16:46 2024 +0000
@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+
+import sys
+import csv
+
+def find_index(headers, term):
+    try:
+        return headers.index(term)
+    except ValueError:
+        return -1
+
+def main(mlst_file, db_path=None):
+    with open(mlst_file, 'r') as file:
+        reader = csv.reader(file, delimiter='\t')
+        mlstout = next(reader)
+
+    schema = mlstout[1]
+    mlstST = mlstout[2]
+
+    # Return the output without appending if schema equals "-"
+    if schema == "-":
+        print("\t".join(mlstout))
+        return 
+
+    if db_path is None:
+        # If no database path is provided, find it using an external command
+        # This requires the 'mlst' command to be installed and available in the path
+        import subprocess
+        mlstdesc = subprocess.check_output(['mlst', '-h']).decode()
+        db_pubmlst = [line for line in mlstdesc.split('\n') if 'db/pubmlst' in line]
+        if db_pubmlst:
+            mlstloc = db_pubmlst[0].split("'")[1].replace("bin/..", "")
+        else:
+            raise Exception("Could not find MLST database location.")
+    else:
+        mlstloc = db_path
+
+    mlst_file_path = f"{mlstloc}/{schema}/{schema}.txt"
+
+    schema_dict = {}
+    with open(mlst_file_path, 'r') as file:
+        reader = csv.reader(file, delimiter='\t')
+        headers = next(reader)
+
+        clonal = find_index(headers, 'clonal_complex')
+        cc = find_index(headers, 'CC')
+        lineage = find_index(headers, 'Lineage')
+        species = find_index(headers, 'species')
+
+        for line in reader:
+            desc = []
+            if clonal > -1 and line[clonal]:
+                desc.append(f"clonal_complex={line[clonal]}")
+            if cc > -1 and line[cc]:
+                desc.append(f"CC={line[cc]}")
+            if lineage > -1 and line[lineage]:
+                desc.append(f"Lineage={line[lineage]}")
+            if species > -1 and line[species]:
+                desc.append(f"species={line[species]}")
+            schema_dict[line[0]] = ','.join(desc)
+
+    output = mlstout[:3]
+    if mlstST in schema_dict:
+        output.append(schema_dict[mlstST])
+    output.extend(mlstout[3:])
+
+    print("\t".join(output))
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("Usage: python mlstAddFields.py <mlst_file> [db_path]")
+        sys.exit(1)
+
+    mlst_file = sys.argv[1]
+    db_path = sys.argv[2] if len(sys.argv) > 2 else None
+
+    main(mlst_file, db_path)
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/microrunqc/run_fastq_scan.py	Fri Feb 02 15:16:46 2024 +0000
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+
+## Run fastq-scan to get mean read length and mean quality score  
+## author: errol strain, estrain@gmail.com
+
+from argparse import (ArgumentParser, FileType)
+import sys
+import glob 
+import subprocess
+import json
+
+def parse_args():
+  "Parse the input arguments, use '-h' for help."
+
+  parser = ArgumentParser(description='Run fastq-scan on a pair of gzipped FASTQ files')
+
+  # Read inputs
+  parser.add_argument('--fastq', type=str, required=True, nargs=2, help='FASTQ files')
+  parser.add_argument('--output', type=str, required=True, nargs=1, help='Output File')
+  parser.add_argument('--type', type=str, required=True, nargs=1, help='File Type (text or gz)')
+
+  return parser.parse_args()
+
+args =parse_args()
+
+# FASTA file
+r1 = args.fastq[0]
+r2 = args.fastq[1]
+
+# Read 1 
+if str(args.type[0]) == "gz" :
+  cmd1 = ["zcat", r1]
+else :
+  cmd1 = ["cat", r1]
+cmd2 = ["fastq-scan"]
+pcmd1= subprocess.Popen(cmd1,stdout= subprocess.PIPE,shell=False)
+r1json = json.loads(subprocess.Popen(cmd2, stdin=pcmd1.stdout,stdout=subprocess.PIPE,shell=False).communicate()[0])
+r1q = round(r1json["qc_stats"]["qual_mean"],1)
+r1l = round(r1json["qc_stats"]["read_mean"],1)
+
+# Read 2
+if str(args.type[0]) == "gz" :
+  cmd1 = ["zcat", r2]
+else :
+  cmd1 = ["cat", r2]
+cmd2 = ["fastq-scan"]
+pcmd1= subprocess.Popen(cmd1,stdout= subprocess.PIPE,shell=False)
+r2json = json.loads(subprocess.Popen(cmd2, stdin=pcmd1.stdout,stdout=subprocess.PIPE,shell=False).communicate()[0])
+r2q = round(r2json["qc_stats"]["qual_mean"],1)
+r2l = round(r2json["qc_stats"]["read_mean"],1)
+
+# Write output to be used by sum_mlst.py
+output = open(args.output[0],"w")
+output.write(str(r1l) + "\t" + str(r2l) + "\t" + str(r1q) + "\t" + str(r2q))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/microrunqc/sum_mlst.py	Fri Feb 02 15:16:46 2024 +0000
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+
+## Generate basic summary stats from SKESA, fastq-scan, and MLST output.
+## author: errol strain, estrain@gmail.com
+
+from argparse import (ArgumentParser, FileType)
+import sys
+import glob 
+import subprocess
+from decimal import Decimal
+
+def parse_args():
+  "Parse the input arguments, use '-h' for help."
+
+  parser = ArgumentParser(description='Generate Basic Summary Statistics from SKESA assemblies, fastq-scan output, and MLST reports')
+
+  # Read inputs
+  parser.add_argument('--fasta', type=str, required=True, nargs=1, help='SKESA FASTA assembly')
+  parser.add_argument('--mlst', type=str, required=True, nargs=1, help='MLST output')
+  parser.add_argument('--fqscan', type=str, required=True, nargs=1, help='fastq-scan output')
+  parser.add_argument('--med', type=str, required=True, nargs=1, help='Median Insert Size')
+  parser.add_argument('--output', type=str, required=True, nargs=1, help='Output File')
+
+  return parser.parse_args()
+
+args =parse_args()
+
+# FASTA file
+fasta = args.fasta[0]
+
+# Get individual and total length of contigs
+cmd = ["awk", "/^>/ {if (seqlen){print seqlen}; ;seqlen=0;next; } { seqlen = seqlen +length($0)}END{print seqlen}",fasta]
+seqlen = subprocess.Popen(cmd,stdout= subprocess.PIPE).communicate()[0] 
+intlen = list(map(int,seqlen.splitlines()))
+totlen = sum(intlen)
+# Count number of contigs
+numtigs = len(intlen) 
+
+# Get coverage information from skesa fasta header
+cmd1 = ["grep",">",fasta] 
+cmd2 = ["cut","-f","3","-d","_"]
+p1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE)
+p2 = subprocess.Popen(cmd2, stdin=p1.stdout, stdout=subprocess.PIPE).communicate()[0]
+covdep = map(float,p2.splitlines())
+covlist = [a*b for a,b in zip([float(i) for i in intlen],covdep)]
+covdep = round(sum(covlist)/totlen,1)
+
+# Calculate N50
+vals = [int(i) for i in intlen]
+vals.sort(reverse=True)
+n50=0
+for counter in range(0,len(vals)-1):
+  if sum(vals[0:counter]) > (totlen/2):
+    n50=vals[counter-1]
+    break
+
+# Read in MLST output
+mlst = open(args.mlst[0],"r")
+profile = mlst.readline()
+els = profile.split("\t")
+
+# Read in median insert size
+medfile = open(args.med[0],"r")
+insert = medfile.readline()
+insert = insert.rstrip()
+
+# Read in fastq-scan
+fqfile = open(args.fqscan[0],"r")
+fq = fqfile.readline()
+fq = fq.rstrip()
+
+output = open(args.output[0],"w")
+
+filehead = str("File\tContigs\tLength\tEstCov\tN50\tMedianInsert\tMeanLength_R1\tMeanLength_R2\tMeanQ_R1\tMeanQ_R2\tScheme\tST\n")
+output.write(filehead)
+
+output.write(str(fasta) + "\t" + str(numtigs) + "\t" + str(totlen) + "\t" + str(covdep) + "\t" + str(n50) +"\t" + str(insert) + "\t" + str(fq))
+for counter in range(1,len(els)):
+  output.write("\t" + str(els[counter]))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/microrunqc/tool_data_table_conf.xml.sample	Fri Feb 02 15:16:46 2024 +0000
@@ -0,0 +1,7 @@
+<tables>
+    <!-- Locations of amrfinderplus database in the required format -->
+    <table name="mlst" comment_char="#" allow_duplicate_entries="False">
+        <columns>value, name, path</columns>
+        <file path="tool-data/mlst.loc" />
+    </table>
+</tables>
--- a/mlst.loc.sample	Fri Jan 19 11:50:31 2024 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-# this is a tab separated file describing the location of mlst databases
-#
-# the columns are:
-# value name path
-#
-# for example
--- a/mlstAddFields.py	Fri Jan 19 11:50:31 2024 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,73 +0,0 @@
-#!/usr/bin/env 
-
-import sys
-import csv
-
-def find_index(headers, term):
-    try:
-        return headers.index(term)
-    except ValueError:
-        return -1
-
-def main(mlst_file, db_path=None):
-    with open(mlst_file, 'r') as file:
-        reader = csv.reader(file, delimiter='\t')
-        mlstout = next(reader)
-
-    schema = mlstout[1]
-    mlstST = mlstout[2]
-
-    if db_path is None:
-        # If no database path is provided, find it using an external command
-        # This requires the 'mlst' command to be installed and available in the path
-        import subprocess
-        mlstdesc = subprocess.check_output(['mlst', '-h']).decode()
-        db_pubmlst = [line for line in mlstdesc.split('\n') if 'db/pubmlst' in line]
-        if db_pubmlst:
-            mlstloc = db_pubmlst[0].split("'")[1].replace("bin/..", "")
-        else:
-            raise Exception("Could not find MLST database location.")
-    else:
-        mlstloc = db_path
-
-    mlst_file_path = f"{mlstloc}/{schema}/{schema}.txt"
-
-    schema_dict = {}
-    with open(mlst_file_path, 'r') as file:
-        reader = csv.reader(file, delimiter='\t')
-        headers = next(reader)
-
-        clonal = find_index(headers, 'clonal_complex')
-        cc = find_index(headers, 'CC')
-        lineage = find_index(headers, 'Lineage')
-        species = find_index(headers, 'species')
-
-        for line in reader:
-            desc = []
-            if clonal > -1 and line[clonal]:
-                desc.append(f"clonal_complex={line[clonal]}")
-            if cc > -1 and line[cc]:
-                desc.append(f"CC={line[cc]}")
-            if lineage > -1 and line[lineage]:
-                desc.append(f"Lineage={line[lineage]}")
-            if species > -1 and line[species]:
-                desc.append(f"species={line[species]}")
-            schema_dict[line[0]] = ','.join(desc)
-
-    output = mlstout[:3]
-    if mlstST in schema_dict:
-        output.append(schema_dict[mlstST])
-    output.extend(mlstout[3:])
-
-    print("\t".join(output))
-
-if __name__ == "__main__":
-    if len(sys.argv) < 2:
-        print("Usage: python mlstAddFields.py <mlst_file> [db_path]")
-        sys.exit(1)
-
-    mlst_file = sys.argv[1]
-    db_path = sys.argv[2] if len(sys.argv) > 2 else None
-
-    main(mlst_file, db_path)
-
--- a/run_fastq_scan.py	Fri Jan 19 11:50:31 2024 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,54 +0,0 @@
-#!/usr/bin/env
-
-## Run fastq-scan to get mean read length and mean quality score  
-## author: errol strain, estrain@gmail.com
-
-from argparse import (ArgumentParser, FileType)
-import sys
-import glob 
-import subprocess
-import json
-
-def parse_args():
-  "Parse the input arguments, use '-h' for help."
-
-  parser = ArgumentParser(description='Run fastq-scan on a pair of gzipped FASTQ files')
-
-  # Read inputs
-  parser.add_argument('--fastq', type=str, required=True, nargs=2, help='FASTQ files')
-  parser.add_argument('--output', type=str, required=True, nargs=1, help='Output File')
-  parser.add_argument('--type', type=str, required=True, nargs=1, help='File Type (text or gz)')
-
-  return parser.parse_args()
-
-args =parse_args()
-
-# FASTA file
-r1 = args.fastq[0]
-r2 = args.fastq[1]
-
-# Read 1 
-if str(args.type[0]) == "gz" :
-  cmd1 = ["zcat", r1]
-else :
-  cmd1 = ["cat", r1]
-cmd2 = ["fastq-scan"]
-pcmd1= subprocess.Popen(cmd1,stdout= subprocess.PIPE,shell=False)
-r1json = json.loads(subprocess.Popen(cmd2, stdin=pcmd1.stdout,stdout=subprocess.PIPE,shell=False).communicate()[0])
-r1q = round(r1json["qc_stats"]["qual_mean"],1)
-r1l = round(r1json["qc_stats"]["read_mean"],1)
-
-# Read 2
-if str(args.type[0]) == "gz" :
-  cmd1 = ["zcat", r2]
-else :
-  cmd1 = ["cat", r2]
-cmd2 = ["fastq-scan"]
-pcmd1= subprocess.Popen(cmd1,stdout= subprocess.PIPE,shell=False)
-r2json = json.loads(subprocess.Popen(cmd2, stdin=pcmd1.stdout,stdout=subprocess.PIPE,shell=False).communicate()[0])
-r2q = round(r2json["qc_stats"]["qual_mean"],1)
-r2l = round(r2json["qc_stats"]["read_mean"],1)
-
-# Write output to be used by sum_mlst.py
-output = open(args.output[0],"w")
-output.write(str(r1l) + "\t" + str(r2l) + "\t" + str(r1q) + "\t" + str(r2q))
--- a/sum_mlst.py	Fri Jan 19 11:50:31 2024 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,79 +0,0 @@
-#!/usr/bin/env
-
-## Generate basic summary stats from SKESA, fastq-scan, and MLST output.
-## author: errol strain, estrain@gmail.com
-
-from argparse import (ArgumentParser, FileType)
-import sys
-import glob 
-import subprocess
-from decimal import Decimal
-
-def parse_args():
-  "Parse the input arguments, use '-h' for help."
-
-  parser = ArgumentParser(description='Generate Basic Summary Statistics from SKESA assemblies, fastq-scan output, and MLST reports')
-
-  # Read inputs
-  parser.add_argument('--fasta', type=str, required=True, nargs=1, help='SKESA FASTA assembly')
-  parser.add_argument('--mlst', type=str, required=True, nargs=1, help='MLST output')
-  parser.add_argument('--fqscan', type=str, required=True, nargs=1, help='fastq-scan output')
-  parser.add_argument('--med', type=str, required=True, nargs=1, help='Median Insert Size')
-  parser.add_argument('--output', type=str, required=True, nargs=1, help='Output File')
-
-  return parser.parse_args()
-
-args =parse_args()
-
-# FASTA file
-fasta = args.fasta[0]
-
-# Get individual and total length of contigs
-cmd = ["awk", "/^>/ {if (seqlen){print seqlen}; ;seqlen=0;next; } { seqlen = seqlen +length($0)}END{print seqlen}",fasta]
-seqlen = subprocess.Popen(cmd,stdout= subprocess.PIPE).communicate()[0] 
-intlen = list(map(int,seqlen.splitlines()))
-totlen = sum(intlen)
-# Count number of contigs
-numtigs = len(intlen) 
-
-# Get coverage information from skesa fasta header
-cmd1 = ["grep",">",fasta] 
-cmd2 = ["cut","-f","3","-d","_"]
-p1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE)
-p2 = subprocess.Popen(cmd2, stdin=p1.stdout, stdout=subprocess.PIPE).communicate()[0]
-covdep = map(float,p2.splitlines())
-covlist = [a*b for a,b in zip([float(i) for i in intlen],covdep)]
-covdep = round(sum(covlist)/totlen,1)
-
-# Calculate N50
-vals = [int(i) for i in intlen]
-vals.sort(reverse=True)
-n50=0
-for counter in range(0,len(vals)-1):
-  if sum(vals[0:counter]) > (totlen/2):
-    n50=vals[counter-1]
-    break
-
-# Read in MLST output
-mlst = open(args.mlst[0],"r")
-profile = mlst.readline()
-els = profile.split("\t")
-
-# Read in median insert size
-medfile = open(args.med[0],"r")
-insert = medfile.readline()
-insert = insert.rstrip()
-
-# Read in fastq-scan
-fqfile = open(args.fqscan[0],"r")
-fq = fqfile.readline()
-fq = fq.rstrip()
-
-output = open(args.output[0],"w")
-
-filehead = str("File\tContigs\tLength\tEstCov\tN50\tMedianInsert\tMeanLength_R1\tMeanLength_R2\tMeanQ_R1\tMeanQ_R2\tScheme\tST\n")
-output.write(filehead)
-
-output.write(str(fasta) + "\t" + str(numtigs) + "\t" + str(totlen) + "\t" + str(covdep) + "\t" + str(n50) +"\t" + str(insert) + "\t" + str(fq))
-for counter in range(1,len(els)):
-  output.write("\t" + str(els[counter]))
--- a/tool_data_table_conf.xml.sample	Fri Jan 19 11:50:31 2024 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,7 +0,0 @@
-<tables>
-    <!-- Locations of amrfinderplus database in the required format -->
-    <table name="mlst" comment_char="#" allow_duplicate_entries="False">
-        <columns>value, name, path</columns>
-        <file path="tool-data/mlst.loc" />
-    </table>
-</tables>