comparison pima_report.py @ 18:e948214a9e3c draft

Uploaded
author greg
date Wed, 22 Mar 2023 13:07:22 +0000
parents b4ed9f55de13
children c509e6819795
comparison
equal deleted inserted replaced
17:b4ed9f55de13 18:e948214a9e3c
18 class PimaReport: 18 class PimaReport:
19 19
20 def __init__(self, analysis_name=None, amr_deletions_file=None, amr_matrix_files=None, assembly_fasta_file=None, 20 def __init__(self, analysis_name=None, amr_deletions_file=None, amr_matrix_files=None, assembly_fasta_file=None,
21 assembly_name=None, bedtools_version=None, blastn_version=None, circos_files=None, 21 assembly_name=None, bedtools_version=None, blastn_version=None, circos_files=None,
22 compute_sequence_length_file=None, contig_coverage_file=None, dbkey=None, dnadiff_snps_file=None, 22 compute_sequence_length_file=None, contig_coverage_file=None, dbkey=None, dnadiff_snps_file=None,
23 dnadiff_version=None, feature_bed_files=None, feature_png_files=None, flye_assembly_info_file=None, 23 dnadiff_version=None, errors_file=None, feature_bed_files=None, feature_png_files=None,
24 flye_version=None, genome_insertions_file=None, gzipped=None, kraken2_report_file=None, 24 flye_assembly_info_file=None, flye_version=None, genome_insertions_file=None, gzipped=None,
25 kraken2_version=None, minimap2_version=None, mutation_regions_bed_file=None, mutation_regions_tsv_files=None, 25 kraken2_report_file=None, kraken2_version=None, minimap2_version=None, mutation_regions_bed_file=None,
26 ont_fastq_file=None, pima_css=None, plasmids_file=None, quast_report_file=None, reference_insertions_file=None, 26 mutation_regions_tsv_files=None, ont_fastq_file=None, pima_css=None, plasmids_file=None,
27 samtools_version=None, varscan_version=None): 27 quast_report_file=None, read_type=None, reference_insertions_file=None, samtools_version=None,
28 varscan_version=None):
28 self.ofh = open("process_log.txt", "w") 29 self.ofh = open("process_log.txt", "w")
29 30
30 self.ofh.write("amr_deletions_file: %s\n" % str(amr_deletions_file)) 31 self.ofh.write("amr_deletions_file: %s\n" % str(amr_deletions_file))
31 self.ofh.write("amr_matrix_files: %s\n" % str(amr_matrix_files)) 32 self.ofh.write("amr_matrix_files: %s\n" % str(amr_matrix_files))
32 self.ofh.write("analysis_name: %s\n" % str(analysis_name)) 33 self.ofh.write("analysis_name: %s\n" % str(analysis_name))
38 self.ofh.write("compute_sequence_length_file: %s\n" % str(compute_sequence_length_file)) 39 self.ofh.write("compute_sequence_length_file: %s\n" % str(compute_sequence_length_file))
39 self.ofh.write("contig_coverage_file: %s\n" % str(contig_coverage_file)) 40 self.ofh.write("contig_coverage_file: %s\n" % str(contig_coverage_file))
40 self.ofh.write("dbkey: %s\n" % str(dbkey)) 41 self.ofh.write("dbkey: %s\n" % str(dbkey))
41 self.ofh.write("dnadiff_snps_file: %s\n" % str(dnadiff_snps_file)) 42 self.ofh.write("dnadiff_snps_file: %s\n" % str(dnadiff_snps_file))
42 self.ofh.write("dnadiff_version: %s\n" % str(dnadiff_version)) 43 self.ofh.write("dnadiff_version: %s\n" % str(dnadiff_version))
44 self.ofh.write("errors_file: %s\n" % str(errors_file))
43 self.ofh.write("feature_bed_files: %s\n" % str(feature_bed_files)) 45 self.ofh.write("feature_bed_files: %s\n" % str(feature_bed_files))
44 self.ofh.write("feature_png_files: %s\n" % str(feature_png_files)) 46 self.ofh.write("feature_png_files: %s\n" % str(feature_png_files))
45 self.ofh.write("flye_assembly_info_file: %s\n" % str(flye_assembly_info_file)) 47 self.ofh.write("flye_assembly_info_file: %s\n" % str(flye_assembly_info_file))
46 self.ofh.write("flye_version: %s\n" % str(flye_version)) 48 self.ofh.write("flye_version: %s\n" % str(flye_version))
47 self.ofh.write("gzipped: %s\n" % str(gzipped)) 49 self.ofh.write("gzipped: %s\n" % str(gzipped))
53 self.ofh.write("mutation_regions_tsv_files: %s\n" % str(mutation_regions_tsv_files)) 55 self.ofh.write("mutation_regions_tsv_files: %s\n" % str(mutation_regions_tsv_files))
54 self.ofh.write("ont_fastq_file: %s\n" % str(ont_fastq_file)) 56 self.ofh.write("ont_fastq_file: %s\n" % str(ont_fastq_file))
55 self.ofh.write("pima_css: %s\n" % str(pima_css)) 57 self.ofh.write("pima_css: %s\n" % str(pima_css))
56 self.ofh.write("plasmids_file: %s\n" % str(plasmids_file)) 58 self.ofh.write("plasmids_file: %s\n" % str(plasmids_file))
57 self.ofh.write("quast_report_file: %s\n" % str(quast_report_file)) 59 self.ofh.write("quast_report_file: %s\n" % str(quast_report_file))
60 self.ofh.write("read_type: %s\n" % str(read_type))
58 self.ofh.write("reference_insertions_file: %s\n" % str(reference_insertions_file)) 61 self.ofh.write("reference_insertions_file: %s\n" % str(reference_insertions_file))
59 self.ofh.write("samtools_version: %s\n" % str(samtools_version)) 62 self.ofh.write("samtools_version: %s\n" % str(samtools_version))
60 self.ofh.write("varscan_version: %s\n" % str(varscan_version)) 63 self.ofh.write("varscan_version: %s\n" % str(varscan_version))
61 64
62 # General 65 # General
85 self.dnadiff_snps_file = dnadiff_snps_file 88 self.dnadiff_snps_file = dnadiff_snps_file
86 if dnadiff_version is None: 89 if dnadiff_version is None:
87 self.dnadiff_version = 'dnadiff (version unknown)' 90 self.dnadiff_version = 'dnadiff (version unknown)'
88 else: 91 else:
89 self.dnadiff_version = re.sub('_', '.', dnadiff_version.rstrip(' _snps_')) 92 self.dnadiff_version = re.sub('_', '.', dnadiff_version.rstrip(' _snps_'))
93 self.errors_file = errors_file
90 self.feature_bed_files = feature_bed_files 94 self.feature_bed_files = feature_bed_files
91 self.feature_png_files = feature_png_files 95 self.feature_png_files = feature_png_files
92 self.flye_assembly_info_file = flye_assembly_info_file 96 self.flye_assembly_info_file = flye_assembly_info_file
93 if flye_version is None: 97 if flye_version is None:
94 self.flye_version = 'flye (version unknown)' 98 self.flye_version = 'flye (version unknown)'
108 self.mutation_regions_bed_file = mutation_regions_bed_file 112 self.mutation_regions_bed_file = mutation_regions_bed_file
109 self.mutation_regions_tsv_files = mutation_regions_tsv_files 113 self.mutation_regions_tsv_files = mutation_regions_tsv_files
110 self.pima_css = pima_css 114 self.pima_css = pima_css
111 self.plasmids_file = plasmids_file 115 self.plasmids_file = plasmids_file
112 self.quast_report_file = quast_report_file 116 self.quast_report_file = quast_report_file
113 self.read_type = 'ONT' 117 self.read_type = read_type.upper()
114 self.reference_insertions_file = reference_insertions_file 118 self.reference_insertions_file = reference_insertions_file
115 self.reference_insertions_file = reference_insertions_file 119 self.reference_insertions_file = reference_insertions_file
116 if samtools_version is None: 120 if samtools_version is None:
117 self.samtools_version = 'samtools (version unknown)' 121 self.samtools_version = 'samtools (version unknown)'
118 else: 122 else:
135 self.feature_title = 'Features found in the assembly' 139 self.feature_title = 'Features found in the assembly'
136 self.feature_methods_title = 'Feature annotation' 140 self.feature_methods_title = 'Feature annotation'
137 self.feature_plot_title = 'Feature annotation plots' 141 self.feature_plot_title = 'Feature annotation plots'
138 self.large_indel_title = 'Large insertions & deletions' 142 self.large_indel_title = 'Large insertions & deletions'
139 self.methods_title = 'Methods' 143 self.methods_title = 'Methods'
144 self.mutation_errors_title = 'Errors finding mutations in the sample'
140 self.mutation_title = 'Mutations found in the sample' 145 self.mutation_title = 'Mutations found in the sample'
141 self.mutation_methods_title = 'Mutation screening' 146 self.mutation_methods_title = 'Mutation screening'
142 self.plasmid_methods_title = 'Plasmid annotation' 147 self.plasmid_methods_title = 'Plasmid annotation'
143 self.plasmid_title = 'Plasmid annotation' 148 self.plasmid_title = 'Plasmid annotation'
144 self.reference_methods_title = 'Reference comparison' 149 self.reference_methods_title = 'Reference comparison'
597 try: 602 try:
598 mutation_regions = pandas.read_csv(self.mutation_regions_bed_file, sep='\t', header=0, index_col=False) 603 mutation_regions = pandas.read_csv(self.mutation_regions_bed_file, sep='\t', header=0, index_col=False)
599 except Exception: 604 except Exception:
600 # Likely an empty file. 605 # Likely an empty file.
601 return 606 return
602 # TODO: this is the only place where reference_genome is used,
603 # so I'm commenting it out for now. We need to confirm if these
604 # errors that require the reference genmoe being passed are necessary.
605 # If so, we'll need to implement data tables in this tool.
606 # Make sure that the positions in the BED file fall within
607 # the chromosomes provided in the reference sequence.
608 """
609 for mutation_region in range(mutation_regions.shape[0]):
610 mutation_region = mutation_regions.iloc[mutation_region, :]
611 if not (mutation_region[0] in self.reference_genome):
612 self.ofh.write("\nMutation region: %s not found in reference genome.\n" % ' '.join(mutation_region.astype(str)))
613 continue
614 if not isinstance(mutation_region[1], int):
615 self.ofh.write("\nNon-integer found in mutation region start (column 2): %s.\n" % str(mutation_region[1]))
616 break
617 elif not isinstance(mutation_region[2], int):
618 self.ofh.write("\nNon-integer found in mutation region start (column 3): %s.\n" % str(mutation_region[2]))
619 break
620 if mutation_region[1] <= 0 or mutation_region[2] <= 0:
621 self.ofh.write("\nMutation region %s starts before the reference sequence.\n" % ' '.join(mutation_region.astype(str)))
622 if mutation_region[1] > len(self.reference_genome[mutation_region[0]].seq) or mutation_region[2] > len(self.reference_genome[mutation_region[0]].seq):
623 self.ofh.write("\nMutation region %s ends after the reference sequence.\n" % ' '.join(mutation_region.astype(str)))
624 """
625 amr_mutations = pandas.Series(dtype=object) 607 amr_mutations = pandas.Series(dtype=object)
626 for region_i in range(mutation_regions.shape[0]): 608 for region_i in range(mutation_regions.shape[0]):
627 region = mutation_regions.iloc[region_i, :] 609 region = mutation_regions.iloc[region_i, :]
628 region_name = str(region['name']) 610 region_name = str(region['name'])
629 self.ofh.write("Processing mutations for region %s\n" % region_name) 611 self.ofh.write("Processing mutations for region %s\n" % region_name)
660 Table_List = ['Reference contig', 'Position', 'Reference', 'Alternate', 'Drug', 'Note'] 642 Table_List = ['Reference contig', 'Position', 'Reference', 'Alternate', 'Drug', 'Note']
661 for i in range(region_mutations.shape[0]): 643 for i in range(region_mutations.shape[0]):
662 Table_List = Table_List + region_mutations.iloc[i, [0, 1, 3, 4, 5, 6]].values.tolist() 644 Table_List = Table_List + region_mutations.iloc[i, [0, 1, 3, 4, 5, 6]].values.tolist()
663 row_count = int(len(Table_List) / 6) 645 row_count = int(len(Table_List) / 6)
664 self.doc.new_table(columns=6, rows=row_count, text=Table_List, text_align='left') 646 self.doc.new_table(columns=6, rows=row_count, text=Table_List, text_align='left')
647 if os.path.getsize(self.errors_file) > 0:
648 # Report the errors encountered when attempting
649 # to find mutations in the sample.
650 self.doc.new_line()
651 self.doc.new_header(level=2, title=self.mutation_errors_title)
652 with open(self.errors_file, 'r') as efh:
653 for i, line in enumerate(efh):
654 line = line.strip()
655 if line:
656 self.doc.new_line('* %s' % line)
665 method = '%s reads were mapped to the reference sequence using %s.' % (self.read_type, self.minimap2_version) 657 method = '%s reads were mapped to the reference sequence using %s.' % (self.read_type, self.minimap2_version)
666 self.methods[self.mutation_methods_title] = self.methods[self.mutation_methods_title].append(pandas.Series(method)) 658 self.methods[self.mutation_methods_title] = self.methods[self.mutation_methods_title].append(pandas.Series(method))
667 method = 'Mutations were identified using %s and %s.' % (self.samtools_version, self.varscan_version) 659 method = 'Mutations were identified using %s and %s.' % (self.samtools_version, self.varscan_version)
668 self.methods[self.mutation_methods_title] = self.methods[self.mutation_methods_title].append(pandas.Series(method)) 660 self.methods[self.mutation_methods_title] = self.methods[self.mutation_methods_title].append(pandas.Series(method))
669 661
701 if amr_deletions.shape[0] > 0: 693 if amr_deletions.shape[0] > 0:
702 amr_deletions.columns = ['contig', 'start', 'stop', 'name', 'type', 'drug', 'note'] 694 amr_deletions.columns = ['contig', 'start', 'stop', 'name', 'type', 'drug', 'note']
703 amr_deletions = amr_deletions.loc[amr_deletions['type'].isin(['large-deletion', 'any']), :] 695 amr_deletions = amr_deletions.loc[amr_deletions['type'].isin(['large-deletion', 'any']), :]
704 self.doc.new_line() 696 self.doc.new_line()
705 self.doc.new_header(level=2, title=self.large_indel_title) 697 self.doc.new_header(level=2, title=self.large_indel_title)
698 self.doc.new_line('This section is informative only when your idolates were identified as *Bacillus anthracis* strains')
706 for genome in ['Reference insertions', 'Query insertions']: 699 for genome in ['Reference insertions', 'Query insertions']:
707 genome_indels = large_indels[genome].copy() 700 genome_indels = large_indels[genome].copy()
708 self.doc.new_line() 701 self.doc.new_line()
709 self.doc.new_header(level=3, title=genome) 702 self.doc.new_header(level=3, title=genome)
710 if (genome_indels.shape[0] == 0): 703 if (genome_indels.shape[0] == 0):
850 parser.add_argument('--compute_sequence_length_file', action='store', dest='compute_sequence_length_file', help='Comnpute sequence length tabular file') 843 parser.add_argument('--compute_sequence_length_file', action='store', dest='compute_sequence_length_file', help='Comnpute sequence length tabular file')
851 parser.add_argument('--contig_coverage_file', action='store', dest='contig_coverage_file', help='Contig coverage TSV file') 844 parser.add_argument('--contig_coverage_file', action='store', dest='contig_coverage_file', help='Contig coverage TSV file')
852 parser.add_argument('--dbkey', action='store', dest='dbkey', help='Reference genome identifier') 845 parser.add_argument('--dbkey', action='store', dest='dbkey', help='Reference genome identifier')
853 parser.add_argument('--dnadiff_snps_file', action='store', dest='dnadiff_snps_file', help='DNAdiff snps tabular file') 846 parser.add_argument('--dnadiff_snps_file', action='store', dest='dnadiff_snps_file', help='DNAdiff snps tabular file')
854 parser.add_argument('--dnadiff_version', action='store', dest='dnadiff_version', default=None, help='DNAdiff version string') 847 parser.add_argument('--dnadiff_version', action='store', dest='dnadiff_version', default=None, help='DNAdiff version string')
848 parser.add_argument('--errors_file', action='store', dest='errors_file', default=None, help='AMR mutations errors encountered txt file')
855 parser.add_argument('--feature_bed_dir', action='store', dest='feature_bed_dir', help='Directory of best feature hits bed files') 849 parser.add_argument('--feature_bed_dir', action='store', dest='feature_bed_dir', help='Directory of best feature hits bed files')
856 parser.add_argument('--feature_png_dir', action='store', dest='feature_png_dir', help='Directory of best feature hits png files') 850 parser.add_argument('--feature_png_dir', action='store', dest='feature_png_dir', help='Directory of best feature hits png files')
857 parser.add_argument('--flye_assembly_info_file', action='store', dest='flye_assembly_info_file', default=None, help='Flye assembly info tabular file') 851 parser.add_argument('--flye_assembly_info_file', action='store', dest='flye_assembly_info_file', default=None, help='Flye assembly info tabular file')
858 parser.add_argument('--flye_version', action='store', dest='flye_version', default=None, help='Flye version string') 852 parser.add_argument('--flye_version', action='store', dest='flye_version', default=None, help='Flye version string')
859 parser.add_argument('--genome_insertions_file', action='store', dest='genome_insertions_file', help='Genome insertions BED file') 853 parser.add_argument('--genome_insertions_file', action='store', dest='genome_insertions_file', help='Genome insertions BED file')
865 parser.add_argument('--mutation_regions_bed_file', action='store', dest='mutation_regions_bed_file', help='AMR mutation regions BRD file') 859 parser.add_argument('--mutation_regions_bed_file', action='store', dest='mutation_regions_bed_file', help='AMR mutation regions BRD file')
866 parser.add_argument('--mutation_regions_dir', action='store', dest='mutation_regions_dir', help='Directory of mutation regions TSV files') 860 parser.add_argument('--mutation_regions_dir', action='store', dest='mutation_regions_dir', help='Directory of mutation regions TSV files')
867 parser.add_argument('--pima_css', action='store', dest='pima_css', help='PIMA css stypesheet') 861 parser.add_argument('--pima_css', action='store', dest='pima_css', help='PIMA css stypesheet')
868 parser.add_argument('--plasmids_file', action='store', dest='plasmids_file', help='pChunks plasmids TSV file') 862 parser.add_argument('--plasmids_file', action='store', dest='plasmids_file', help='pChunks plasmids TSV file')
869 parser.add_argument('--quast_report_file', action='store', dest='quast_report_file', help='Quast report tabular file') 863 parser.add_argument('--quast_report_file', action='store', dest='quast_report_file', help='Quast report tabular file')
864 parser.add_argument('--read_type', action='store', dest='read_type', help='Sample read type (ONT or Illumina)')
870 parser.add_argument('--reference_insertions_file', action='store', dest='reference_insertions_file', help='Reference insertions BED file') 865 parser.add_argument('--reference_insertions_file', action='store', dest='reference_insertions_file', help='Reference insertions BED file')
871 parser.add_argument('--samtools_version', action='store', dest='samtools_version', default=None, help='Samtools version string') 866 parser.add_argument('--samtools_version', action='store', dest='samtools_version', default=None, help='Samtools version string')
872 parser.add_argument('--varscan_version', action='store', dest='varscan_version', default=None, help='Varscan version string') 867 parser.add_argument('--varscan_version', action='store', dest='varscan_version', default=None, help='Varscan version string')
873 868
874 args = parser.parse_args() 869 args = parser.parse_args()
910 args.compute_sequence_length_file, 905 args.compute_sequence_length_file,
911 args.contig_coverage_file, 906 args.contig_coverage_file,
912 args.dbkey, 907 args.dbkey,
913 args.dnadiff_snps_file, 908 args.dnadiff_snps_file,
914 args.dnadiff_version, 909 args.dnadiff_version,
910 args.errors_file,
915 feature_bed_files, 911 feature_bed_files,
916 feature_png_files, 912 feature_png_files,
917 args.flye_assembly_info_file, 913 args.flye_assembly_info_file,
918 args.flye_version, 914 args.flye_version,
919 args.genome_insertions_file, 915 args.genome_insertions_file,
925 mutation_regions_files, 921 mutation_regions_files,
926 args.ont_fastq_file, 922 args.ont_fastq_file,
927 args.pima_css, 923 args.pima_css,
928 args.plasmids_file, 924 args.plasmids_file,
929 args.quast_report_file, 925 args.quast_report_file,
926 args.read_type,
930 args.reference_insertions_file, 927 args.reference_insertions_file,
931 args.samtools_version, 928 args.samtools_version,
932 args.varscan_version) 929 args.varscan_version)
933 markdown_report.make_report() 930 markdown_report.make_report()