Mercurial > repos > greg > pima_report
changeset 1:67d0939b56b0 draft
Uploaded
author | greg |
---|---|
date | Tue, 07 Mar 2023 16:05:14 +0000 |
parents | 0a558f444c98 |
children | 9cb62054a87a |
files | pima_report.py pima_report.xml |
diffstat | 2 files changed, 248 insertions(+), 97 deletions(-) [+] |
line wrap: on
line diff
--- a/pima_report.py Fri Mar 03 22:06:23 2023 +0000 +++ b/pima_report.py Tue Mar 07 16:05:14 2023 +0000 @@ -15,37 +15,57 @@ class PimaReport: - def __init__(self, analysis_name=None, assembly_fasta_file=None, assembly_name=None, feature_bed_files=None, feature_png_files=None, - contig_coverage_file=None, dbkey=None, gzipped=None, illumina_fastq_file=None, mutation_regions_bed_file=None, - mutation_regions_tsv_files=None, pima_css=None): + def __init__(self, analysis_name=None, amr_deletions_file=None, amr_matrix_files=None, assembly_fasta_file=None, + assembly_name=None, compute_sequence_length_file=None, contig_coverage_file=None, dbkey=None, + dnadiff_snps_file=None, feature_bed_files=None, feature_png_files=None, flye_assembly_info_file=None, + flye_version=None, genome_insertions_file=None, gzipped=None, illumina_fastq_file=None, + mutation_regions_bed_file=None, mutation_regions_tsv_files=None, pima_css=None, plasmids_file=None, + reference_insertions_file=None): self.ofh = open("process_log.txt", "w") + self.ofh.write("amr_deletions_file: %s\n" % str(amr_deletions_file)) + self.ofh.write("amr_matrix_files: %s\n" % str(amr_matrix_files)) self.ofh.write("analysis_name: %s\n" % str(analysis_name)) self.ofh.write("assembly_fasta_file: %s\n" % str(assembly_fasta_file)) self.ofh.write("assembly_name: %s\n" % str(assembly_name)) + self.ofh.write("compute_sequence_length_file: %s\n" % str(compute_sequence_length_file)) + self.ofh.write("contig_coverage_file: %s\n" % str(contig_coverage_file)) + self.ofh.write("dbkey: %s\n" % str(dbkey)) + self.ofh.write("dnadiff_snps_file: %s\n" % str(dnadiff_snps_file)) self.ofh.write("feature_bed_files: %s\n" % str(feature_bed_files)) self.ofh.write("feature_png_files: %s\n" % str(feature_png_files)) - self.ofh.write("contig_coverage_file: %s\n" % str(contig_coverage_file)) - self.ofh.write("dbkey: %s\n" % str(dbkey)) + self.ofh.write("flye_assembly_info_file: %s\n" % str(flye_assembly_info_file)) + self.ofh.write("flye_version: %s\n" % str(flye_version)) self.ofh.write("gzipped: %s\n" % str(gzipped)) + self.ofh.write("genome_insertions_file: %s\n" % str(genome_insertions_file)) self.ofh.write("illumina_fastq_file: %s\n" % str(illumina_fastq_file)) self.ofh.write("mutation_regions_bed_file: %s\n" % str(mutation_regions_bed_file)) self.ofh.write("mutation_regions_tsv_files: %s\n" % str(mutation_regions_tsv_files)) self.ofh.write("pima_css: %s\n" % str(pima_css)) + self.ofh.write("plasmids_file: %s\n" % str(plasmids_file)) + # self.ofh.write("reference_genome: %s\n" % str(reference_genome)) + self.ofh.write("reference_insertions_file: %s\n" % str(reference_insertions_file)) # General self.doc = None self.report_md = 'pima_report.md' # Inputs + self.amr_deletions_file = amr_deletions_file + self.amr_matrix_files = amr_matrix_files self.analysis_name = analysis_name self.assembly_fasta_file = assembly_fasta_file self.assembly_name = assembly_name + self.compute_sequence_length_file = compute_sequence_length_file + self.contig_coverage_file = contig_coverage_file + self.dbkey = dbkey + self.dnadiff_snps_file = dnadiff_snps_file self.feature_bed_files = feature_bed_files self.feature_png_files = feature_png_files - self.contig_coverage_file = contig_coverage_file - self.dbkey = dbkey + self.flye_assembly_info_file = flye_assembly_info_file + self.flye_version = flye_version self.gzipped = gzipped + self.genome_insertions_file = genome_insertions_file self.illumina_fastq_file = illumina_fastq_file self.mutation_regions_bed_file = mutation_regions_bed_file self.mutation_regions_tsv_files = mutation_regions_tsv_files @@ -54,6 +74,9 @@ self.ont_n50 = None self.ont_read_count = None self.pima_css = pima_css + self.plasmids_file = plasmids_file + # self.reference_genome = reference_genome + self.reference_insertions_file = reference_insertions_file # Titles self.alignment_title = 'Comparison with reference' @@ -73,9 +96,9 @@ self.mutation_title = 'Mutations found in the sample' self.mutation_methods_title = 'Mutation screening' self.plasmid_methods_title = 'Plasmid annotation' + self.plasmid_title = 'Plasmid annotation' self.reference_methods_title = 'Reference comparison' self.snp_indel_title = 'SNPs and small indels' - #self.summary_title = 'Summary' self.summary_title = 'Analysis of %s' % analysis_name # Methods @@ -98,7 +121,6 @@ # Values self.assembly_size = 0 self.contig_info = None - self.did_flye_ont_fastq = False self.did_medaka_ont_assembly = False self.feature_hits = pandas.Series(dtype='float64') self.illumina_length_mean = 0 @@ -106,6 +128,11 @@ self.illumina_bases = 0 self.mean_coverage = 0 self.num_assembly_contigs = 0 + # TODO: should the following 2 values be passed as parameters? + self.ont_n50_min = 2500 + self.ont_coverage_min = 30 + self.quast_indels = 0 + self.quast_mismatches = 0 # Actions self.did_guppy_ont_fast5 = False @@ -139,6 +166,22 @@ self.contig_info[self.read_type] = pandas.read_csv(self.contig_coverage_file, header=None, index_col=None, sep='\t').sort_values(1, axis=0, ascending=False) self.contig_info[self.read_type].columns = ['contig', 'size', 'coverage'] self.mean_coverage = (self.contig_info[self.read_type].iloc[:, 1] * self.contig_info[self.read_type].iloc[:, 2]).sum() / self.contig_info[self.read_type].iloc[:, 1].sum() + if self.mean_coverage <= self.ont_coverage_min: + warning = '%s mean coverage ({:.0f}X) is less than the recommended minimum ({:.0f}X).'.format(self.mean_coverage, self.ont_coverage_min) % self.read_type + self.assembly_notes = self.assembly_notes.append(pandas.Series(warning)) + # Report if some contigs have low coverage. + low_coverage = self.contig_info[self.read_type].loc[self.contig_info[self.read_type]['coverage'] < self.ont_coverage_min, :] + if low_coverage.shape[0] >= 0: + for contig_i in range(low_coverage.shape[0]): + warning = '%s coverage of {:s} ({:.0f}X) is less than the recommended minimum ({:.0f}X).'.format(low_coverage.iloc[contig_i, 0], low_coverage.iloc[contig_i, 2], self.ont_coverage_min) % self.read_type + self.assembly_notes = self.assembly_notes.append(pandas.Series(warning)) + # See if some contigs have anolously low coverage. + fold_coverage = self.contig_info[self.read_type]['coverage'] / self.mean_coverage + low_coverage = self.contig_info[self.read_type].loc[fold_coverage < 1 / 5, :] + if low_coverage.shape[0] >= 0 : + for contig_i in range(low_coverage.shape[0]): + warning = '%s coverage of {:s} ({:.0f}X) is less than 1/5 the mean coverage ({:.0f}X).'.format(low_coverage.iloc[contig_i, 0], low_coverage.iloc[contig_i, 2], self.mean_coverage) % self.read_type + self.assembly_notes = self.assembly_notes.append(pandas.Series(warning)) def load_fasta(self, fasta): sequence = pandas.Series(dtype=object) @@ -185,7 +228,6 @@ self.illumina_bases = self.format_kmg(self.illumina_bases, decimals=1) def start_doc(self): - #header_text = 'Analysis of %s' % self.analysis_name self.doc = MdUtils(file_name=self.report_md, title='') def add_run_information(self): @@ -256,6 +298,20 @@ ] self.doc.new_table(columns=2, rows=4, text=Table_List, text_align='left') + def evaluate_assembly(self) : + assembly_info = pandas.read_csv(self.compute_sequence_length_file, sep='\t', header=None) + assembly_info.columns = ['contig', 'length'] + self.contig_sizes = assembly_info + # Take a look at the number of contigs, their sizes, + # and circularity. Warn if things don't look good. + if assembly_info.shape[0] > 4: + warning = 'Assembly produced {:d} contigs, more than ususally expected; assembly may be fragmented'.format(assembly_info.shape[0]) + self.assembly_notes = self.assembly_notes.append(pandas.Series(warning)) + small_contigs = assembly_info.loc[assembly_info['length'] <= 3000, :] + if small_contigs.shape[0] > 0: + warning = 'Assembly produced {:d} small contigs ({:s}); assembly may include spurious sequences.'.format(small_contigs.shape[0], ', '.join(small_contigs['contig'])) + self.assembly_notes = self.assembly_notes.append(pandas.Series(warning)) + def add_assembly_information(self): self.ofh.write("\nXXXXXX In add_assembly_information\n\n") if self.assembly_fasta_file is None: @@ -291,15 +347,17 @@ if result[1] == '0': self.error_out('No ONT reads found') ont_n50, ont_read_count, ont_raw_bases = [int(i) for i in result] - command = ' '.join([opener, fastq_file, '| awk \'{getline;print length($0);getline;getline;}\'']) result = self.run_command(command) result = list(filter(lambda x: x != '', result)) - ont_read_lengths = [int(i) for i in result] - - return ([ont_n50, ont_read_count, ont_raw_bases, ont_read_lengths]) + # TODO: the following are not yet used... + # ont_read_lengths = [int(i) for i in result] + # ont_bases = self.format_kmg(ont_raw_bases, decimals=1) + if ont_n50 <= self.ont_n50_min: + warning = 'ONT N50 (%s) is less than the recommended minimum (%s)' % (str(ont_n50), str(self.ont_n50_min)) + self.assembly_notes = self.assembly_notes.append(pandas.Series(warning)) def wordwrap_markdown(self, string): if string: @@ -347,8 +405,8 @@ self.doc.new_line('<div style="page-break-after: always;"></div>') self.doc.new_line() self.doc.new_header(2, self.assembly_notes_title) - #for note in self.analysis.assembly_notes: - # self.doc.new_line(note) + for note in self.assembly_notes: + self.doc.new_line(note) def add_contamination(self): self.ofh.write("\nXXXXXX In add_contamination\n\n") @@ -384,11 +442,9 @@ "Category", "Quantity", 'SNPs', - #'{:,}'.format(self.analysis.quast_mismatches), - 'NA' + '{:,}'.format(self.quast_mismatches), 'Small indels', - #'{:,}'.format(self.analysis.quast_indels) - 'NA' + '{:,}'.format(self.quast_indels) ] self.doc.new_table(columns=2, rows=3, text=Table_1, text_align='left') self.doc.new_line('<div style="page-break-after: always;"></div>') @@ -444,12 +500,17 @@ feature = contig_features.iloc[i, :].copy(deep=True) self.ofh.write("feature: %s\n" % str(feature)) feature[4] = '{:.3f}'.format(feature[4]) - Table_List = Table_List + feature[1:].values.tolist() + # FIXME: Uncommenting the following line (which is + # https://github.com/appliedbinf/pima_md/blob/main/MarkdownReport.py#L317) + # will cause the job to fail with this exception: + # ValueError: columns * rows is not equal to text length + # Table_List = Table_List + feature[1:].values.tolist() self.ofh.write("Table_List: %s\n" % str(Table_List)) row_count = int(len(Table_List) / 5) self.ofh.write("row_count: %s\n" % str(row_count)) self.doc.new_line() - self.doc.new_table(columns=7, rows=row_count, text=Table_List, text_align='left') + self.ofh.write("Before new_table, len(Table_List):: %s\n" % str(len(Table_List))) + self.doc.new_table(columns=5, rows=row_count, text=Table_List, text_align='left') blastn_version = 'The genome assembly was queried for features using blastn.' bedtools_version = 'Feature hits were clustered using bedtools and the highest scoring hit for each cluster was reported.' method = '%s %s' % (blastn_version, bedtools_version) @@ -469,11 +530,34 @@ self.ofh.write("\nXXXXXX In add_mutations\n\n") if len(self.mutation_regions_tsv_files) == 0: return - try: + try : mutation_regions = pandas.read_csv(self.mutation_regions_bed_file, sep='\t', header=0, index_col=False) except Exception: # Likely an empty file. return + # TODO: this is the only place where reference_genome is used, + # so I'm commenting it out for now. We need to confirm if these + # errors that require the reference genmoe being passed are necessary. + # If so, we'll need to implement data tables in this tool. + # Make sure that the positions in the BED file fall within + # the chromosomes provided in the reference sequence. + """ + for mutation_region in range(mutation_regions.shape[0]): + mutation_region = mutation_regions.iloc[mutation_region, :] + if not (mutation_region[0] in self.reference_genome): + self.ofh.write("\nMutation region: %s not found in reference genome.\n" % ' '.join(mutation_region.astype(str))) + continue + if not isinstance(mutation_region[1], int): + self.ofh.write("\nNon-integer found in mutation region start (column 2): %s.\n" % str(mutation_region[1])) + break + elif not isinstance(mutation_region[2], int): + self.ofh.write("\nNon-integer found in mutation region start (column 3): %s.\n" % str(mutation_region[2])) + break + if mutation_region[1] <= 0 or mutation_region[2] <= 0: + self.ofh.write("\nMutation region %s starts before the reference sequence.\n" % ' '.join(mutation_region.astype(str))) + if mutation_region[1] > len(self.reference_genome[mutation_region[0]].seq) or mutation_region[2] > len(self.reference_genome[mutation_region[0]].seq): + self.ofh.write("\nMutation region %s ends after the reference sequence.\n" % ' '.join(mutation_region.astype(str))) + """ amr_mutations = pandas.Series(dtype=object) for region_i in range(mutation_regions.shape[0]): region = mutation_regions.iloc[region_i, :] @@ -483,13 +567,13 @@ if region_mutations_tsv_name not in self.mutation_regions_tsv_files: continue region_mutations_tsv = self.mutation_regions_tsv_files[region_mutations_tsv_name] - try: + try : region_mutations = pandas.read_csv(region_mutations_tsv, sep='\t', header=0, index_col=False) except Exception: region_mutations = pandas.DataFrame() if region_mutations.shape[0] == 0: continue - # Figure out what kind of mutations are in this region + # Figure out what kind of mutations are in this region. region_mutation_types = pandas.Series(['snp'] * region_mutations.shape[0], name='TYPE', index=region_mutations.index) region_mutation_types[region_mutations['REF'].str.len() != region_mutations['ALT'].str.len()] = 'small-indel' region_mutation_drugs = pandas.Series(region['drug'] * region_mutations.shape[0], name='DRUG', index=region_mutations.index) @@ -521,78 +605,85 @@ def add_amr_matrix(self): self.ofh.write("\nXXXXXX In add_amr_matrix\n\n") # Make sure that we have an AMR matrix to plot - #if not getattr(self.analysis, 'did_draw_amr_matrix', False): - # return - #amr_matrix_png = self.analysis.amr_matrix_png - #self.doc.new_line() - #self.doc.new_header(level=2, title=self.amr_matrix_title) - #self.doc.new_line('AMR genes and mutations with their corresponding drugs.') - #self.doc.new_line( - # self.doc.new_inline_image( - # text='AMR genes and mutations with their corresponding drugs', - # path=amr_matrix_png - # ) - #) - pass + if len(self.amr_matrix_files) == 0: + return + self.doc.new_line() + self.doc.new_header(level=2, title=self.amr_matrix_title) + self.doc.new_line('AMR genes and mutations with their corresponding drugs') + for amr_matrix_file in self.amr_matrix_files: + self.doc.new_line(self.doc.new_inline_image(text='AMR genes and mutations with their corresponding drugs', + path=os.path.abspath(amr_matrix_file))) def add_large_indels(self): self.ofh.write("\nXXXXXX In add_large_indels\n\n") - # Make sure we looked for mutations - #if len(self.analysis.large_indels) == 0: - # return - #large_indels = self.analysis.large_indels - #self.doc.new_line() - #self.doc.new_header(level=2, title=self.large_indel_title) - #for genome in ['Reference insertions', 'Query insertions']: - # genome_indels = large_indels[genome].copy() - # self.doc.new_line() - # self.doc.new_header(level=3, title=genome) - # if (genome_indels.shape[0] == 0): - # continue - # genome_indels.iloc[:, 1] = genome_indels.iloc[:, 1].apply(lambda x: '{:,}'.format(x)) - # genome_indels.iloc[:, 2] = genome_indels.iloc[:, 2].apply(lambda x: '{:,}'.format(x)) - # genome_indels.iloc[:, 3] = genome_indels.iloc[:, 3].apply(lambda x: '{:,}'.format(x)) - # Table_List = [ - # 'Reference contig', 'Start', 'Stop', 'Size (bp)' - # ] - # for i in range(genome_indels.shape[0]): - # Table_List = Table_List + genome_indels.iloc[i, :].values.tolist() - # row_count = int(len(Table_List) / 4) - # self.doc.new_table(columns=4, rows=row_count, text=Table_List, text_align='left') - #method = 'Large insertions or deletions were found as the complement of aligned regions using bedtools.' - #self.methods[self.reference_methods_title] = self.methods[self.reference_methods_title].append( - # pandas.Series(method)) - #self.doc.new_line() - #self.doc.new_line('<div style="page-break-after: always;"></div>') - #self.doc.new_line() - pass + large_indels = pandas.Series(dtype='float64') + # Pull in insertions. + try: + reference_insertions = pandas.read_csv(filepath_or_buffer=self.reference_insertions_file, sep='\t', header=None) + except Exception: + reference_insertions = pandas.DataFrame() + try: + genome_insertions = pandas.read_csv(filepath_or_buffer=self.genome_insertions_file, sep='\t', header=None) + except Exception: + genome_insertions = pandas.DataFrame() + large_indels['Reference insertions'] = reference_insertions + large_indels['Query insertions'] = genome_insertions + # TODO: we don't seem to be reporting snps and deletions for some reason... + # Pull in the number of SNPs and small indels. + try: + snps = pandas.read_csv(filepath_or_buffer=self.dnadiff_snps_file, sep='\t', header=None) + # TODO: the following is not used... + # small_indels = snps.loc[(snps.iloc[:, 1] == '.') | (snps.iloc[:, 2] == '.'), :] + snps = snps.loc[(snps.iloc[:, 1] != '.') & (snps.iloc[:, 2] != '.'), :] + except Exception: + snps = pandas.DataFrame() + # Pull in deletions. + try: + amr_deletions = pandas.read_csv(filepath_or_buffer=self.amr_deletion_file, sep='\t', header=None) + except Exception: + amr_deletions = pandas.DataFrame() + if amr_deletions.shape[0] > 0: + amr_deletions.columns = ['contig', 'start', 'stop', 'name', 'type', 'drug', 'note'] + amr_deletions = amr_deletions.loc[amr_deletions['type'].isin(['large-deletion', 'any']), :] + self.doc.new_line() + self.doc.new_header(level=2, title=self.large_indel_title) + for genome in ['Reference insertions', 'Query insertions']: + genome_indels = large_indels[genome].copy() + self.doc.new_line() + self.doc.new_header(level=3, title=genome) + if (genome_indels.shape[0] == 0): + continue + genome_indels.iloc[:, 1] = genome_indels.iloc[:, 1].apply(lambda x: '{:,}'.format(x)) + genome_indels.iloc[:, 2] = genome_indels.iloc[:, 2].apply(lambda x: '{:,}'.format(x)) + genome_indels.iloc[:, 3] = genome_indels.iloc[:, 3].apply(lambda x: '{:,}'.format(x)) + Table_List = [ + 'Reference contig', 'Start', 'Stop', 'Size (bp)' + ] + for i in range(genome_indels.shape[0]): + Table_List = Table_List + genome_indels.iloc[i, :].values.tolist() + row_count = int(len(Table_List) / 4) + self.doc.new_table(columns=4, rows=row_count, text=Table_List, text_align='left') + method = 'Large insertions or deletions were found as the complement of aligned regions using bedtools.' + self.methods[self.reference_methods_title] = self.methods[self.reference_methods_title].append(pandas.Series(method)) + self.doc.new_line() + self.doc.new_line('<div style="page-break-after: always;"></div>') + self.doc.new_line() def add_plasmids(self): - self.ofh.write("\nXXXXXX In add_plasmids\n\n") - """ - if not getattr(self.analysis, 'did_call_plasmids', False): - return - # Make sure we looked for mutations - #plasmids = self.analysis.plasmids - if plasmids is None: + try : + plasmids = pandas.read_csv(filepath_or_buffer=self.plasmids_file, sep='\t', header=0) + except Exception: return plasmids = plasmids.copy() self.doc.new_line() - #self.doc.new_header(level=2, title=self.analysis.plasmid_title) + self.doc.new_header(level=2, title=self.plasmid_title) if (plasmids.shape[0] == 0): self.doc.new_line('None') return plasmids.iloc[:, 3] = plasmids.iloc[:, 3].apply(lambda x: '{:,}'.format(x)) plasmids.iloc[:, 4] = plasmids.iloc[:, 4].apply(lambda x: '{:,}'.format(x)) plasmids.iloc[:, 5] = plasmids.iloc[:, 5].apply(lambda x: '{:,}'.format(x)) - Table_List = [ - 'Genome contig', - 'Plasmid hit', - 'Plasmid acc.', - 'Contig size', - 'Aliged', - 'Plasmid size' - ] + Table_List = ['Genome contig', 'Plasmid hit', 'Plasmid acc.', 'Contig size', 'Aliged', 'Plasmid size'] for i in range(plasmids.shape[0]): Table_List = Table_List + plasmids.iloc[i, 0:6].values.tolist() row_count = int(len(Table_List) / 6) @@ -603,8 +694,6 @@ self.methods[self.plasmid_methods_title] = self.methods[self.plasmid_methods_title].append(pandas.Series(method)) method = 'Plasmid-to-genome hits were resolved using the pChunks algorithm.' self.methods[self.plasmid_methods_title] = self.methods[self.plasmid_methods_title].append(pandas.Series(method)) - """ - pass def add_methods(self): self.ofh.write("\nXXXXXX In add_methods\n\n") @@ -614,7 +703,6 @@ return self.doc.new_line() self.doc.new_header(level=2, title=self.methods_title) - for methods_section in self.methods.index.tolist(): if self.methods[methods_section] is None or len(self.methods[methods_section]) == 0: continue @@ -639,15 +727,25 @@ methods += ['ONT reads were demultiplexed and trimmed using qcat'] self.methods[self.basecalling_methods_title] = pandas.Series(methods) self.add_illumina_library_information() + self.add_contig_info() + self.evaluate_assembly() self.add_assembly_information() - self.add_contig_info() - self.add_assembly_notes() - if self.did_flye_ont_fastq: - method = 'ONT reads were assembled using Flye.' + if self.flye_assembly_info_file is not None: + method = 'ONT reads were assembled using %s' % self.flye_version self.methods[self.assembly_methods_title] = self.methods[self.assembly_methods_title].append(pandas.Series(method)) + # Pull in the assembly summary and look at the coverage. + assembly_info = pandas.read_csv(self.flye_assembly_info_file, header=0, index_col=0, sep='\t') + # Look for non-circular contigs. + open_contigs = assembly_info.loc[assembly_info['circ.'] == 'N', :] + if open_contigs.shape[0] > 0: + open_contig_ids = open_contigs.index.values + warning = 'Flye reported {:d} open contigs ({:s}); assembly may be incomplete.'.format(open_contigs.shape[0], ', '.join(open_contig_ids)) + self.assembly_notes = self.assembly_notes.append(pandas.Series(warning)) if self.did_medaka_ont_assembly: method = 'the genome assembly was polished using ont reads and medaka.' self.methods[self.assembly_methods_title] = self.methods[self.assembly_methods_title].append(pandas.series(method)) + self.info_ont_fastq(self.illumina_fastq_file) + self.add_assembly_notes() def make_tex(self): self.doc.new_table_of_contents(table_title='detailed run information', depth=2, marker="tableofcontents") @@ -666,9 +764,9 @@ self.add_features() self.add_feature_plots() self.add_mutations() - # TODO stuff working to here... self.add_large_indels() self.add_plasmids() + # TODO stuff working to here... self.add_amr_matrix() # self.add_snps() self.add_methods() @@ -687,21 +785,36 @@ parser = argparse.ArgumentParser() +parser.add_argument('--amr_deletions_file', action='store', dest='amr_deletions_file', help='AMR deletions BED file') +parser.add_argument('--amr_matrix_png_dir', action='store', dest='amr_matrix_png_dir', help='Directory of AMR matrix PNG files') parser.add_argument('--analysis_name', action='store', dest='analysis_name', help='Sample identifier') parser.add_argument('--assembly_fasta_file', action='store', dest='assembly_fasta_file', help='Assembly fasta file') parser.add_argument('--assembly_name', action='store', dest='assembly_name', help='Assembly identifier') +parser.add_argument('--compute_sequence_length_file', action='store', dest='compute_sequence_length_file', help='Comnpute sequence length tabular file') +parser.add_argument('--contig_coverage_file', action='store', dest='contig_coverage_file', help='Contig coverage TSV file') +parser.add_argument('--dbkey', action='store', dest='dbkey', help='Reference genome identifier') +parser.add_argument('--dnadiff_snps_file', action='store', dest='dnadiff_snps_file', help='DNAdiff snps tabular file') parser.add_argument('--feature_bed_dir', action='store', dest='feature_bed_dir', help='Directory of best feature hits bed files') parser.add_argument('--feature_png_dir', action='store', dest='feature_png_dir', help='Directory of best feature hits png files') -parser.add_argument('--contig_coverage_file', action='store', dest='contig_coverage_file', help='Contig coverage TSV file') -parser.add_argument('--dbkey', action='store', dest='dbkey', help='Reference genome') +parser.add_argument('--flye_assembly_info_file', action='store', dest='flye_assembly_info_file', default=None, help='Flye assembly info tabular file') +parser.add_argument('--flye_version', action='store', dest='flye_version', default=None, help='Flye version string') +parser.add_argument('--genome_insertions_file', action='store', dest='genome_insertions_file', help='Genome insertions BED file') parser.add_argument('--gzipped', action='store_true', dest='gzipped', default=False, help='Input sample is gzipped') parser.add_argument('--illumina_fastq_file', action='store', dest='illumina_fastq_file', help='Input sample') parser.add_argument('--mutation_regions_bed_file', action='store', dest='mutation_regions_bed_file', help='AMR mutation regions BRD file') parser.add_argument('--mutation_regions_dir', action='store', dest='mutation_regions_dir', help='Directory of mutation regions TSV files') parser.add_argument('--pima_css', action='store', dest='pima_css', help='PIMA css stypesheet') +parser.add_argument('--plasmids_file', action='store', dest='plasmids_file', help='pChunks plasmids TSV file') +parser.add_argument('--reference_insertions_file', action='store', dest='reference_insertions_file', help='Reference insertions BED file') +# parser.add_argument('--reference_genome', action='store', dest='reference_genome', help='Reference genome fasta file') args = parser.parse_args() +# Prepare the AMR matrix PNG files. +amr_matrix_files = [] +for file_name in sorted(os.listdir(args.amr_matrix_png_dir)): + file_path = os.path.abspath(os.path.join(args.amr_matrix_png_dir, file_name)) + amr_matrix_files.append(file_path) # Prepare the features BED files. feature_bed_files = [] for file_name in sorted(os.listdir(args.feature_bed_dir)): @@ -719,15 +832,24 @@ mutation_regions_files.append(file_path) markdown_report = PimaReport(args.analysis_name, + args.amr_deletions_file, + amr_matrix_files, args.assembly_fasta_file, args.assembly_name, + args.compute_sequence_length_file, + args.contig_coverage_file, + args.dbkey, + args.dnadiff_snps_file, feature_bed_files, feature_png_files, - args.contig_coverage_file, - args.dbkey, + args.flye_assembly_info_file, + args.flye_version, + args.genome_insertions_file, args.gzipped, args.illumina_fastq_file, args.mutation_regions_bed_file, mutation_regions_files, - args.pima_css) + args.pima_css, + args.plasmids_file, + args.reference_insertions_file) markdown_report.make_report()
--- a/pima_report.xml Fri Mar 03 22:06:23 2023 +0000 +++ b/pima_report.xml Tue Mar 07 16:05:14 2023 +0000 @@ -9,12 +9,22 @@ #set analysis_name = re.sub('[^\s\w\-]', '_', str($illumina_fastq_file.element_identifier)) #set assembly_name = re.sub('[^\s\w\-]', '_', str($assembly_fasta_file.element_identifier)) +#if str($flye_assembly_info_file) not in ['None', '']: + #set flye_version = re.sub('[^\s\w\-]', '_', str($flye_assembly_info_file.element_identifier)) + #set flye_version = $flye_version.rstrip('(assembly info)') +#end if +mkdir amr_matrix_png_dir && mkdir feature_bed_dir && mkdir feature_png_dir && mkdir mutation_regions_dir && touch 'pima_report.pdf' && +#for $i in $amr_matrices_png: + #set file_name = $i.file_name + #set identifier = re.sub('[^\s\w\-\\.]', '_', str($i.element_identifier)) + ln -s $i 'amr_matrix_png_dir/$identifier' && +#end for #for $i in $features_bed: #set file_name = $i.file_name #set identifier = re.sub('[^\s\w\-\\.]', '_', str($i.element_identifier)) @@ -32,13 +42,22 @@ #end for python '${__tool_directory__}/pima_report.py' +--amr_matrix_png_dir 'amr_matrix_png_dir' +--amr_deletions_file '$amr_deletions_file' --analysis_name '$analysis_name' --assembly_fasta_file '$assembly_fasta_file' --assembly_name '$assembly_name' +--compute_sequence_length_file '$compute_sequence_length_file' --contig_coverage_file '$contig_coverage_file' --dbkey '$aligned_sample.metadata.dbkey' +--dnadiff_snps_file '$dnadiff_snps_file' --feature_bed_dir 'feature_bed_dir' --feature_png_dir 'feature_png_dir' +#if str($flye_assembly_info_file) not in ['None', '']: + --flye_assembly_info_file '$flye_assembly_info_file' + --flye_version '$flye_version' +#end if +--genome_insertions_file '$genome_insertions_file' #if $illumina_fastq_file.ext.endswith(".gz"): --gzipped #end if @@ -46,17 +65,27 @@ --mutation_regions_dir 'mutation_regions_dir' --mutation_regions_bed_file '$mutation_regions_bed_file' --pima_css '${__tool_directory__}/pima.css' +--plasmids_file '$plasmids_file' +--reference_insertions_file '$reference_insertions_file' && mv 'pima_report.pdf' '$output' ]]></command> <inputs> + <param name="amr_matrices_png" format="png" type="data_collection" collection_type="list" label="Collection of AMR matrix PNG files"/> <param name="aligned_sample" type="data" format="bam" label="Aligned sample BAM file"/> + <param name="amr_deletions_file" type="data" format="bed" label="AMR deletions BED file"/> <param name="assembly_fasta_file" type="data" format="fasta" label="Assembly FASTA file"/> + <param name="compute_sequence_length_file" type="data" format="tabular,tsv" label="Compute sequence length tabular file"/> + <param name="contig_coverage_file" type="data" format="tabular,tsv" label="Contig coverage tabular file"/> + <param name="dnadiff_snps_file" type="data" format="tabular" label="DNAdiff snps tabular file"/> <param name="features_bed" format="bed" type="data_collection" collection_type="list" label="Collection of best feature hits BED files"/> <param name="features_png" format="png" type="data_collection" collection_type="list" label="Collection of best feature hits PNG files"/> - <param name="contig_coverage_file" type="data" format="tabular,tsv" label="Contig coverage tabular file"/> + <param name="flye_assembly_info_file" type="data" format="tabular,tsv" optional="true" label="Flye assembly info tabular file" help="Optional, ignored if not selected"/> + <param name="genome_insertions_file" type="data" format="bed" label="Genome insertions BED file"/> <param name="illumina_fastq_file" type="data" format="fastqsanger,fastqsanger.gz" label="Fastq sample file"/> <param name="mutation_regions" format="tabular,tsv" type="data_collection" collection_type="list" label="Collection of mutation regions tabular files"/> <param name="mutation_regions_bed_file" type="data" format="mutations_regions,bed" label="Mutation regions BED file"/> + <param name="reference_insertions_file" type="data" format="bed" label="Reference insertions BED file"/> + <param name="plasmids_file" type="data" format="tsv" label="pChunks plasmids TSV file"/> </inputs> <outputs> <data name="output" format="pdf"/>