diff pima_report.py @ 13:f03c80bb22e9 draft

Uploaded
author greg
date Thu, 16 Mar 2023 14:42:13 +0000
parents 99613333fd1f
children 95b1d1a9497d
line wrap: on
line diff
--- a/pima_report.py	Fri Mar 10 16:35:16 2023 +0000
+++ b/pima_report.py	Thu Mar 16 14:42:13 2023 +0000
@@ -16,13 +16,13 @@
 class PimaReport:
 
     def __init__(self, analysis_name=None, amr_deletions_file=None, amr_matrix_files=None, assembly_fasta_file=None,
-                 assembly_name=None, bedtools_version=None, blastn_version=None, compute_sequence_length_file=None,
-                 contig_coverage_file=None, dbkey=None, dnadiff_snps_file=None, dnadiff_version=None,
-                 feature_bed_files=None, feature_png_files=None, flye_assembly_info_file=None, flye_version=None,
-                 genome_insertions_file=None, gzipped=None, illumina_fastq_file=None, kraken2_report_file=None,
-                 kraken2_version=None, minimap2_version=None, mutation_regions_bed_file=None,
-                 mutation_regions_tsv_files=None, pima_css=None, plasmids_file=None, reference_insertions_file=None,
-                 samtools_version=None, varscan_version=None):
+                 assembly_name=None, bedtools_version=None, blastn_version=None, circos_files=None,
+                 compute_sequence_length_file=None, contig_coverage_file=None, dbkey=None, dnadiff_snps_file=None,
+                 dnadiff_version=None, feature_bed_files=None, feature_png_files=None, flye_assembly_info_file=None,
+                 flye_version=None, genome_insertions_file=None, gzipped=None, illumina_fastq_file=None,
+                 kraken2_report_file=None, kraken2_version=None, minimap2_version=None, mutation_regions_bed_file=None,
+                 mutation_regions_tsv_files=None, pima_css=None, plasmids_file=None, quast_report_file=None,
+                 reference_insertions_file=None, samtools_version=None, varscan_version=None):
         self.ofh = open("process_log.txt", "w")
 
         self.ofh.write("amr_deletions_file: %s\n" % str(amr_deletions_file))
@@ -32,6 +32,7 @@
         self.ofh.write("assembly_name: %s\n" % str(assembly_name))
         self.ofh.write("bedtools_version: %s\n" % str(bedtools_version))
         self.ofh.write("blastn_version: %s\n" % str(blastn_version))
+        self.ofh.write("circos_files: %s\n" % str(circos_files))
         self.ofh.write("compute_sequence_length_file: %s\n" % str(compute_sequence_length_file))
         self.ofh.write("contig_coverage_file: %s\n" % str(contig_coverage_file))
         self.ofh.write("dbkey: %s\n" % str(dbkey))
@@ -51,6 +52,7 @@
         self.ofh.write("mutation_regions_tsv_files: %s\n" % str(mutation_regions_tsv_files))
         self.ofh.write("pima_css: %s\n" % str(pima_css))
         self.ofh.write("plasmids_file: %s\n" % str(plasmids_file))
+        self.ofh.write("quast_report_file: %s\n" % str(quast_report_file))
         self.ofh.write("reference_insertions_file: %s\n" % str(reference_insertions_file))
         self.ofh.write("samtools_version: %s\n" % str(samtools_version))
         self.ofh.write("varscan_version: %s\n" % str(varscan_version))
@@ -62,7 +64,8 @@
         # Inputs
         self.amr_deletions_file = amr_deletions_file
         self.amr_matrix_files = amr_matrix_files
-        self.analysis_name = re.sub('_', '.', analysis_name.rstrip(' _consensus_'))
+        self.analysis_name = analysis_name.split('_')[0]
+        self.ofh.write("self.analysis_name: %s\n" % str(self.analysis_name))
         self.assembly_fasta_file = assembly_fasta_file
         self.assembly_name = re.sub('_', '.', assembly_name.rstrip(' _consensus_'))
         if bedtools_version is None:
@@ -73,6 +76,7 @@
             self.blastn_version = 'blastn (version unknown)'
         else:
             self.blastn_version = re.sub('_', '.', blastn_version.rstrip(' _features_'))
+        self.circos_files = circos_files
         self.compute_sequence_length_file = compute_sequence_length_file
         self.contig_coverage_file = contig_coverage_file
         self.dbkey = dbkey
@@ -108,6 +112,8 @@
         self.ont_read_count = None
         self.pima_css = pima_css
         self.plasmids_file = plasmids_file
+        self.quast_report_file = quast_report_file
+        self.reference_insertions_file = reference_insertions_file
         self.reference_insertions_file = reference_insertions_file
         if samtools_version is None:
             self.samtools_version = 'samtools (version unknown)'
@@ -139,7 +145,7 @@
         self.plasmid_title = 'Plasmid annotation'
         self.reference_methods_title = 'Reference comparison'
         self.snp_indel_title = 'SNPs and small indels'
-        self.summary_title = 'Analysis of %s' % analysis_name
+        self.summary_title = 'Summary'
 
         # Methods
         self.methods = pandas.Series(dtype='float64')
@@ -163,13 +169,9 @@
         self.illumina_length_mean = 0
         self.illumina_read_count = 0
         self.illumina_bases = 0
-        self.mean_coverage = 0
-        self.num_assembly_contigs = 0
         # TODO: should the following 2 values  be passed as  parameters?
         self.ont_n50_min = 2500
         self.ont_coverage_min = 30
-        self.quast_indels = 0
-        self.quast_mismatches = 0
 
         # Actions
         self.did_guppy_ont_fast5 = False
@@ -265,7 +267,8 @@
         self.illumina_bases = self.format_kmg(self.illumina_bases, decimals=1)
 
     def start_doc(self):
-        self.doc = MdUtils(file_name=self.report_md, title='')
+        header_text = 'Analysis of ' + self.analysis_name
+        self.doc = MdUtils(file_name=self.report_md, title=header_text)
 
     def add_run_information(self):
         self.ofh.write("\nXXXXXX In add_run_information\n\n")
@@ -319,7 +322,7 @@
 
     def add_illumina_library_information(self):
         self.ofh.write("\nXXXXXX In add_illumina_library_information\n\n")
-        if self.illumina_length_mean is None:
+        if self.illumina_length_mean == 0:
             return
         self.doc.new_line()
         self.doc.new_header(2, 'Illumina library statistics')
@@ -475,38 +478,41 @@
 
     def add_alignment(self):
         self.ofh.write("\nXXXXXX In add_alignment\n\n")
-        # TODO: implement the draw_circos function for this.
-        if len(self.contig_alignment) > 0:
-            alignments = self.contig_alignment
-        else:
-            return
-        self.doc.new_line()
-        self.doc.new_header(level=2, title=self.alignment_title)
-        self.doc.new_line()
-        self.doc.new_header(level=3, title=self.snp_indel_title)
-        Table_1 = [
-            "Category",
-            "Quantity",
-            'SNPs',
-            '{:,}'.format(self.quast_mismatches),
-            'Small indels',
-            '{:,}'.format(self.quast_indels)
-        ]
+        if self.quast_report_file is not None:
+            # Process quast values.
+            quast_report = pandas.read_csv(self.quast_report_file, header=0, index_col=0, sep='\t')
+            quast_mismatches = int(float(quast_report.loc['# mismatches per 100 kbp', :][0]) * (float(quast_report.loc['Total length (>= 0 bp)', :][0]) / 100000.))
+            quast_indels = int(float(quast_report.loc['# indels per 100 kbp', :][0]) * (float(quast_report.loc['Total length (>= 0 bp)', :][0]) / 100000.))
+            self.doc.new_line()
+            self.doc.new_header(level=2, title=self.alignment_title)
+            self.doc.new_line()
+            self.doc.new_header(level=3, title=self.snp_indel_title)
+            Table_1 = [
+                "Category",
+                "Quantity",
+                'SNPs',
+                '{:,}'.format(quast_mismatches),
+                'Small indels',
+                '{:,}'.format(quast_indels)
+            ]
         self.doc.new_table(columns=2, rows=3, text=Table_1, text_align='left')
         self.doc.new_line('<div style="page-break-after: always;"></div>')
         self.doc.new_line()
+        # TODO: self.alignment_notes is not currently populated.
         if len(self.alignment_notes) > 0:
             self.doc.new_header(level=3, title=self.alignment_notes_title)
             for note in self.alignment_notes:
                 self.doc.new_line(note)
-        for contig in alignments.index.tolist():
-            contig_title = 'Alignment to %s' % contig
-            image_png = alignments[contig]
-            self.doc.new_line()
-            self.doc.new_header(level=3, title=contig_title)
-            self.doc.new_line(self.doc.new_inline_image(text='contig_title', path=os.path.abspath(image_png)))
-            self.doc.new_line('<div style="page-break-after: always;"></div>')
-            self.doc.new_line()
+        if len(self.circos_files) > 0:
+            # Add circos PNG files.
+            for circos_file in self.circos_files:
+                contig = os.path.basename(circos_file)
+                contig_title = 'Alignment to %s' % contig
+                self.doc.new_line()
+                self.doc.new_header(level=3, title=contig_title)
+                self.doc.new_line(self.doc.new_inline_image(text='contig_title', path=os.path.abspath(circos_file)))
+                self.doc.new_line('<div style="page-break-after: always;"></div>')
+                self.doc.new_line()
         method = 'The genome assembly was aligned against the reference sequencing using dnadiff version %s.' % self.dnadiff_version
         self.methods[self.reference_methods_title] = self.methods[self.reference_methods_title].append(pandas.Series(method))
 
@@ -643,7 +649,7 @@
                 self.doc.new_table(columns=6, rows=row_count, text=Table_List, text_align='left')
         method = '%s reads were mapped to the reference sequence using %s.' % (self.read_type, self.minimap2_version)
         self.methods[self.mutation_methods_title] = self.methods[self.mutation_methods_title].append(pandas.Series(method))
-        method = 'Mutations were identified using %s mpileup and %s.' % (self.samtools_version, self.varscan_version)
+        method = 'Mutations were identified using %s and %s.' % (self.samtools_version, self.varscan_version)
         self.methods[self.mutation_methods_title] = self.methods[self.mutation_methods_title].append(pandas.Series(method))
 
     def add_amr_matrix(self):
@@ -672,15 +678,6 @@
             genome_insertions = pandas.DataFrame()
         large_indels['Reference insertions'] = reference_insertions
         large_indels['Query insertions'] = genome_insertions
-        # TODO: we don't seem to be reporting snps and deletions for some reason...
-        # Pull in the number of SNPs and small indels.
-        try:
-            snps = pandas.read_csv(filepath_or_buffer=self.dnadiff_snps_file, sep='\t', header=None)
-            # TODO: the following is not used...
-            # small_indels = snps.loc[(snps.iloc[:, 1] == '.') | (snps.iloc[:, 2] == '.'), :]
-            snps = snps.loc[(snps.iloc[:, 1] != '.') & (snps.iloc[:, 2] != '.'), :]
-        except Exception:
-            snps = pandas.DataFrame()
         # Pull in deletions.
         try:
             amr_deletions = pandas.read_csv(filepath_or_buffer=self.amr_deletion_file, sep='\t', header=None)
@@ -835,6 +832,7 @@
 parser.add_argument('--assembly_name', action='store', dest='assembly_name', help='Assembly identifier')
 parser.add_argument('--bedtools_version', action='store', dest='bedtools_version', default=None, help='Bedtools version string')
 parser.add_argument('--blastn_version', action='store', dest='blastn_version', default=None, help='Blastn version string')
+parser.add_argument('--circos_png_dir', action='store', dest='circos_png_dir', help='Directory of circos PNG files')
 parser.add_argument('--compute_sequence_length_file', action='store', dest='compute_sequence_length_file', help='Comnpute sequence length tabular file')
 parser.add_argument('--contig_coverage_file', action='store', dest='contig_coverage_file', help='Contig coverage TSV file')
 parser.add_argument('--dbkey', action='store', dest='dbkey', help='Reference genome identifier')
@@ -854,6 +852,7 @@
 parser.add_argument('--mutation_regions_dir', action='store', dest='mutation_regions_dir', help='Directory of mutation regions TSV files')
 parser.add_argument('--pima_css', action='store', dest='pima_css', help='PIMA css stypesheet')
 parser.add_argument('--plasmids_file', action='store', dest='plasmids_file', help='pChunks plasmids TSV file')
+parser.add_argument('--quast_report_file', action='store', dest='quast_report_file', help='Quast report tabular file')
 parser.add_argument('--reference_insertions_file', action='store', dest='reference_insertions_file', help='Reference insertions BED file')
 parser.add_argument('--samtools_version', action='store', dest='samtools_version', default=None, help='Samtools version string')
 parser.add_argument('--varscan_version', action='store', dest='varscan_version', default=None, help='Varscan version string')
@@ -865,6 +864,11 @@
 for file_name in sorted(os.listdir(args.amr_matrix_png_dir)):
     file_path = os.path.abspath(os.path.join(args.amr_matrix_png_dir, file_name))
     amr_matrix_files.append(file_path)
+# Prepare the circos PNG files.
+circos_files = []
+for file_name in sorted(os.listdir(args.circos_png_dir)):
+    file_path = os.path.abspath(os.path.join(args.circos_png_dir, file_name))
+    circos_files.append(file_path)
 # Prepare the features BED files.
 feature_bed_files = []
 for file_name in sorted(os.listdir(args.feature_bed_dir)):
@@ -888,6 +892,7 @@
                              args.assembly_name,
                              args.bedtools_version,
                              args.blastn_version,
+                             circos_files,
                              args.compute_sequence_length_file,
                              args.contig_coverage_file,
                              args.dbkey,
@@ -907,6 +912,7 @@
                              mutation_regions_files,
                              args.pima_css,
                              args.plasmids_file,
+                             args.quast_report_file,
                              args.reference_insertions_file,
                              args.samtools_version,
                              args.varscan_version)