changeset 14:95b1d1a9497d draft

Uploaded
author greg
date Fri, 17 Mar 2023 17:23:43 +0000
parents f03c80bb22e9
children 02283aa193c3
files pima_report.py pima_report.xml
diffstat 2 files changed, 52 insertions(+), 38 deletions(-) [+]
line wrap: on
line diff
--- a/pima_report.py	Thu Mar 16 14:42:13 2023 +0000
+++ b/pima_report.py	Fri Mar 17 17:23:43 2023 +0000
@@ -9,6 +9,8 @@
 from Bio import SeqIO
 from datetime import date
 from mdutils.mdutils import MdUtils
+# FIXME: TableOfContents doesn't work.
+# from mdutils.tools import TableOfContents
 
 CDC_ADVISORY = 'The analysis and report presented here should be treated as preliminary.  Please contact the CDC/BDRD with any results regarding _Bacillus anthracis_.'
 
@@ -19,10 +21,10 @@
                  assembly_name=None, bedtools_version=None, blastn_version=None, circos_files=None,
                  compute_sequence_length_file=None, contig_coverage_file=None, dbkey=None, dnadiff_snps_file=None,
                  dnadiff_version=None, feature_bed_files=None, feature_png_files=None, flye_assembly_info_file=None,
-                 flye_version=None, genome_insertions_file=None, gzipped=None, illumina_fastq_file=None,
-                 kraken2_report_file=None, kraken2_version=None, minimap2_version=None, mutation_regions_bed_file=None,
-                 mutation_regions_tsv_files=None, pima_css=None, plasmids_file=None, quast_report_file=None,
-                 reference_insertions_file=None, samtools_version=None, varscan_version=None):
+                 flye_version=None, genome_insertions_file=None, gzipped=None, kraken2_report_file=None,
+                 kraken2_version=None, minimap2_version=None, mutation_regions_bed_file=None, mutation_regions_tsv_files=None,
+                 ont_fastq_file=None, pima_css=None, plasmids_file=None, quast_report_file=None, reference_insertions_file=None,
+                 samtools_version=None, varscan_version=None):
         self.ofh = open("process_log.txt", "w")
 
         self.ofh.write("amr_deletions_file: %s\n" % str(amr_deletions_file))
@@ -44,12 +46,12 @@
         self.ofh.write("flye_version: %s\n" % str(flye_version))
         self.ofh.write("gzipped: %s\n" % str(gzipped))
         self.ofh.write("genome_insertions_file: %s\n" % str(genome_insertions_file))
-        self.ofh.write("illumina_fastq_file: %s\n" % str(illumina_fastq_file))
         self.ofh.write("kraken2_report_file: %s\n" % str(kraken2_report_file))
         self.ofh.write("kraken2_version: %s\n" % str(kraken2_version))
         self.ofh.write("minimap2_version: %s\n" % str(minimap2_version))
         self.ofh.write("mutation_regions_bed_file: %s\n" % str(mutation_regions_bed_file))
         self.ofh.write("mutation_regions_tsv_files: %s\n" % str(mutation_regions_tsv_files))
+        self.ofh.write("ont_fastq_file: %s\n" % str(ont_fastq_file))
         self.ofh.write("pima_css: %s\n" % str(pima_css))
         self.ofh.write("plasmids_file: %s\n" % str(plasmids_file))
         self.ofh.write("quast_report_file: %s\n" % str(quast_report_file))
@@ -94,7 +96,6 @@
             self.flye_version = re.sub('_', '.', flye_version.rstrip(' _assembly info_'))
         self.gzipped = gzipped
         self.genome_insertions_file = genome_insertions_file
-        self.illumina_fastq_file = illumina_fastq_file
         self.kraken2_report_file = kraken2_report_file
         if kraken2_version is None:
             self.kraken2_version = 'kraken2 (version unknown)'
@@ -106,13 +107,10 @@
             self.minimap2_version = re.sub('_', '.', minimap2_version)
         self.mutation_regions_bed_file = mutation_regions_bed_file
         self.mutation_regions_tsv_files = mutation_regions_tsv_files
-        self.read_type = 'Illumina'
-        self.ont_bases = None
-        self.ont_n50 = None
-        self.ont_read_count = None
         self.pima_css = pima_css
         self.plasmids_file = plasmids_file
         self.quast_report_file = quast_report_file
+        self.read_type = 'ONT'
         self.reference_insertions_file = reference_insertions_file
         self.reference_insertions_file = reference_insertions_file
         if samtools_version is None:
@@ -166,16 +164,25 @@
         self.contig_info = None
         self.did_medaka_ont_assembly = False
         self.feature_hits = pandas.Series(dtype='float64')
-        self.illumina_length_mean = 0
-        self.illumina_read_count = 0
-        self.illumina_bases = 0
-        # TODO: should the following 2 values  be passed as  parameters?
+        self.illumina_fastq_file = None
+        self.illumina_length_mean = None
+        self.illumina_read_count = None
+        self.illumina_bases = None
+        self.ont_bases = None
+        # TODO: should the following be passed as a parameter?
+        self.ont_coverage_min = 30
+        self.ont_fast5 = None
+        self.ont_fastq_file = ont_fastq_file
+        self.ont_n50 = None
+        # TODO: should the following be passed as a parameter?
         self.ont_n50_min = 2500
-        self.ont_coverage_min = 30
+        self.ont_raw_fastq = self.analysis_name
+        self.ont_read_count = None
 
         # Actions
         self.did_guppy_ont_fast5 = False
         self.did_qcat_ont_fastq = False
+        self.info_ont_fastq(self.ont_fastq_file)
         self.info_illumina_fastq()
         self.load_contig_info()
 
@@ -237,12 +244,14 @@
 
     def info_illumina_fastq(self):
         self.ofh.write("\nXXXXXX In info_illumina_fastq\n\n")
+        if self.illumina_length_mean is None:
+            return
         if self.gzipped:
             opener = 'gunzip -c'
         else:
             opener = 'cat'
         command = ' '.join([opener,
-                            self.illumina_fastq_file,
+                            self.ont_fastq_file,
                             '| awk \'{getline;s += length($1);getline;getline;}END{print s/(NR/4)"\t"(NR/4)"\t"s}\''])
         output = self.run_command(command)
         self.ofh.write("output:\n%s\n" % str(output))
@@ -260,9 +269,9 @@
         self.illumina_read_count += int(values[1])
         self.ofh.write("values[2]:\n%s\n" % str(values[2]))
         self.illumina_bases += int(values[2])
-        # The original PIMA code inserts self.illumina_fastq into
+        # The original PIMA code inserts self.illumina_fastq_file into
         # a list for no apparent reason.  We don't do that here.
-        # self.illumina_length_mean /= len(self.illumina_fastq)
+        # self.illumina_length_mean /= len(self.illumina_fastq_file)
         self.illumina_length_mean /= 1
         self.illumina_bases = self.format_kmg(self.illumina_bases, decimals=1)
 
@@ -270,6 +279,12 @@
         header_text = 'Analysis of ' + self.analysis_name
         self.doc = MdUtils(file_name=self.report_md, title=header_text)
 
+    def add_table_of_contents(self):
+        self.doc.create_marker(text_marker="TableOfContents")
+        self.doc.new_line()
+        self.doc.new_line('<div style="page-break-after: always;"></div>')
+        self.doc.new_line()
+
     def add_run_information(self):
         self.ofh.write("\nXXXXXX In add_run_information\n\n")
         self.doc.new_line()
@@ -281,11 +296,11 @@
             "Date",
             date.today(),
             "ONT FAST5",
-            "N/A",
+            self.wordwrap_markdown(self.ont_fast5),
             "ONT FASTQ",
-            "N/A",
+            self.wordwrap_markdown(self.ont_raw_fastq),
             "Illumina FASTQ",
-            self.wordwrap_markdown(self.analysis_name),
+            self.wordwrap_markdown(self.illumina_fastq_file),
             "Assembly",
             self.wordwrap_markdown(self.assembly_name),
             "Reference",
@@ -293,6 +308,8 @@
         ]
         self.doc.new_table(columns=2, rows=7, text=Table_list, text_align='left')
         self.doc.new_line()
+        # FIXME: the following doesn't work.
+        # self.add_table_of_contents()
         self.doc.new_line()
 
     def add_ont_library_information(self):
@@ -311,7 +328,7 @@
             "ONT bases",
             '{:s}'.format(self.ont_bases),
             "Illumina FASTQ",
-            self.wordwrap_markdown(self.illumina_fastq_file),
+            self.wordwrap_markdown(self.ont_fastq_file),
             "Assembly",
             self.wordwrap_markdown(self.assembly_name),
             "Reference",
@@ -322,7 +339,7 @@
 
     def add_illumina_library_information(self):
         self.ofh.write("\nXXXXXX In add_illumina_library_information\n\n")
-        if self.illumina_length_mean == 0:
+        if self.illumina_length_mean is None:
             return
         self.doc.new_line()
         self.doc.new_header(2, 'Illumina library statistics')
@@ -386,17 +403,15 @@
         result = list(re.split('\\t', self.run_command(command)[0]))
         if result[1] == '0':
             self.error_out('No ONT reads found')
-        ont_n50, ont_read_count, ont_raw_bases = [int(i) for i in result]
+        self.ont_n50, self.ont_read_count, ont_raw_bases = [int(i) for i in result]
         command = ' '.join([opener,
                             fastq_file,
                             '| awk \'{getline;print length($0);getline;getline;}\''])
         result = self.run_command(command)
         result = list(filter(lambda x: x != '', result))
-        # TODO: the following are not yet used...
-        # ont_read_lengths = [int(i) for i in result]
-        # ont_bases = self.format_kmg(ont_raw_bases, decimals=1)
-        if ont_n50 <= self.ont_n50_min:
-            warning = 'ONT N50 (%s) is less than the recommended minimum (%s)' % (str(ont_n50), str(self.ont_n50_min))
+        self.ont_bases = self.format_kmg(ont_raw_bases, decimals=1)
+        if self.ont_n50 <= self.ont_n50_min:
+            warning = 'ONT N50 (%s) is less than the recommended minimum (%s)' % (str(self.ont_n50), str(self.ont_n50_min))
             self.assembly_notes = self.assembly_notes.append(pandas.Series(warning))
 
     def wordwrap_markdown(self, string):
@@ -767,7 +782,7 @@
         if self.did_qcat_ont_fastq:
             methods += ['ONT reads were demultiplexed and trimmed using qcat']
         self.methods[self.basecalling_methods_title] = pandas.Series(methods)
-        self.add_illumina_library_information()
+        # self.add_illumina_library_information()
         self.add_contig_info()
         self.evaluate_assembly()
         self.add_assembly_information()
@@ -785,7 +800,6 @@
         if self.did_medaka_ont_assembly:
             method = 'the genome assembly was polished using ont reads and medaka.'
             self.methods[self.assembly_methods_title] = self.methods[self.assembly_methods_title].append(pandas.series(method))
-        self.info_ont_fastq(self.illumina_fastq_file)
         self.add_assembly_notes()
 
     def make_tex(self):
@@ -844,7 +858,7 @@
 parser.add_argument('--flye_version', action='store', dest='flye_version', default=None, help='Flye version string')
 parser.add_argument('--genome_insertions_file', action='store', dest='genome_insertions_file', help='Genome insertions BED file')
 parser.add_argument('--gzipped', action='store_true', dest='gzipped', default=False, help='Input sample is gzipped')
-parser.add_argument('--illumina_fastq_file', action='store', dest='illumina_fastq_file', help='Input sample')
+parser.add_argument('--ont_fastq_file', action='store', dest='ont_fastq_file', help='Input sample')
 parser.add_argument('--kraken2_report_file', action='store', dest='kraken2_report_file', default=None, help='kraken2 report file')
 parser.add_argument('--kraken2_version', action='store', dest='kraken2_version', default=None, help='kraken2 version string')
 parser.add_argument('--minimap2_version', action='store', dest='minimap2_version', default=None, help='minimap2 version string')
@@ -904,12 +918,12 @@
                              args.flye_version,
                              args.genome_insertions_file,
                              args.gzipped,
-                             args.illumina_fastq_file,
                              args.kraken2_report_file,
                              args.kraken2_version,
                              args.minimap2_version,
                              args.mutation_regions_bed_file,
                              mutation_regions_files,
+                             args.ont_fastq_file,
                              args.pima_css,
                              args.plasmids_file,
                              args.quast_report_file,
--- a/pima_report.xml	Thu Mar 16 14:42:13 2023 +0000
+++ b/pima_report.xml	Fri Mar 17 17:23:43 2023 +0000
@@ -7,7 +7,7 @@
     <command detect_errors="exit_code"><![CDATA[
 #import re
 
-#set analysis_name = re.sub('[^\s\w\-]', '_', str($illumina_fastq_file.element_identifier))
+#set analysis_name = re.sub('[^\s\w\-]', '_', str($ont_fastq_file.element_identifier))
 #set assembly_name = re.sub('[^\s\w\-]', '_', str($assembly_fasta_file.element_identifier))
 
 #if str($bedtools_complementbed_file) not in ['None', '']:
@@ -95,10 +95,10 @@
     --flye_version '$flye_version'
 #end if
 --genome_insertions_file '$genome_insertions_file'
-#if $illumina_fastq_file.ext.endswith(".gz"):
+#if $ont_fastq_file.ext.endswith(".gz"):
     --gzipped
 #end if
---illumina_fastq_file '$illumina_fastq_file'
+--ont_fastq_file '$ont_fastq_file'
 #if str($kraken2_report_file) not in ['None', '']:
     --kraken2_report_file '$kraken2_report_file'
     --kraken2_version '$kraken2_version'
@@ -136,7 +136,7 @@
         <param name="flye_assembly_info_file" type="data" format="tabular,tsv" optional="true" label="Flye assembly info tabular file" help="Optional, ignored if not selected"/>
         <param name="genome_insertions_file" type="data" format="bed" label="Genome insertions BED file"/>
         <param name="kraken2_report_file" type="data" format="tabular,tsv" optional="true" label="Kraken2 report tabular file" help="Optional, ignored if not selected"/>
-        <param name="illumina_fastq_file" type="data" format="fastqsanger,fastqsanger.gz" label="Fastq sample file"/>
+        <param name="ont_fastq_file" type="data" format="fastqsanger,fastqsanger.gz" label="ONT fastq sample file"/>
         <param name="minimap2_bam_file" type="data" format="bam" label="Minimap2 BAM file"/>
         <param name="mutation_regions" format="tabular,tsv" type="data_collection" collection_type="list" label="Collection of mutation regions tabular files"/>
         <param name="mutation_regions_bed_file" type="data" format="mutations_regions,bed" label="Mutation regions BED file"/>
@@ -154,7 +154,7 @@
             <param name="aligned_sample" value="aligned_sample.bam" ftype="bam"/>
             <param name="assembly_fasta_file" value="assembly_fasta.fasta" ftype="fasta"/>
             <param name="contig_coverage_file" value="contig_coverage.tabular" ftype="tabular"/>
-            <param name="illumina_fastq_file" value="illumina_fastq.fastq" ftype="fastq"/>
+            <param name="ont_fastq_file" value="ont_fastq.fastq" ftype="fastq"/>
             <output name="output" value="output.pdf" ftype="pdf"/>
         </test>
     </tests>