Mercurial > repos > greg > pima_report
changeset 2:9cb62054a87a draft
Uploaded
author | greg |
---|---|
date | Thu, 09 Mar 2023 16:26:29 +0000 |
parents | 67d0939b56b0 |
children | 33d9832420c5 |
files | pima_report.py pima_report.xml |
diffstat | 2 files changed, 80 insertions(+), 36 deletions(-) [+] |
line wrap: on
line diff
--- a/pima_report.py Tue Mar 07 16:05:14 2023 +0000 +++ b/pima_report.py Thu Mar 09 16:26:29 2023 +0000 @@ -16,11 +16,11 @@ class PimaReport: def __init__(self, analysis_name=None, amr_deletions_file=None, amr_matrix_files=None, assembly_fasta_file=None, - assembly_name=None, compute_sequence_length_file=None, contig_coverage_file=None, dbkey=None, - dnadiff_snps_file=None, feature_bed_files=None, feature_png_files=None, flye_assembly_info_file=None, - flye_version=None, genome_insertions_file=None, gzipped=None, illumina_fastq_file=None, - mutation_regions_bed_file=None, mutation_regions_tsv_files=None, pima_css=None, plasmids_file=None, - reference_insertions_file=None): + assembly_name=None, blastn_version=None, compute_sequence_length_file=None, contig_coverage_file=None, + dbkey=None, dnadiff_snps_file=None, dnadiff_version=None, feature_bed_files=None, feature_png_files=None, + flye_assembly_info_file=None, flye_version=None, genome_insertions_file=None, gzipped=None, + illumina_fastq_file=None, kraken2_report_file=None, kraken2_version=None, mutation_regions_bed_file=None, + mutation_regions_tsv_files=None, pima_css=None, plasmids_file=None, reference_insertions_file=None): self.ofh = open("process_log.txt", "w") self.ofh.write("amr_deletions_file: %s\n" % str(amr_deletions_file)) @@ -28,10 +28,12 @@ self.ofh.write("analysis_name: %s\n" % str(analysis_name)) self.ofh.write("assembly_fasta_file: %s\n" % str(assembly_fasta_file)) self.ofh.write("assembly_name: %s\n" % str(assembly_name)) + self.ofh.write("blastn_version: %s\n" % str(blastn_version)) self.ofh.write("compute_sequence_length_file: %s\n" % str(compute_sequence_length_file)) self.ofh.write("contig_coverage_file: %s\n" % str(contig_coverage_file)) self.ofh.write("dbkey: %s\n" % str(dbkey)) self.ofh.write("dnadiff_snps_file: %s\n" % str(dnadiff_snps_file)) + self.ofh.write("dnadiff_version: %s\n" % str(dnadiff_version)) self.ofh.write("feature_bed_files: %s\n" % str(feature_bed_files)) self.ofh.write("feature_png_files: %s\n" % str(feature_png_files)) self.ofh.write("flye_assembly_info_file: %s\n" % str(flye_assembly_info_file)) @@ -39,6 +41,8 @@ self.ofh.write("gzipped: %s\n" % str(gzipped)) self.ofh.write("genome_insertions_file: %s\n" % str(genome_insertions_file)) self.ofh.write("illumina_fastq_file: %s\n" % str(illumina_fastq_file)) + self.ofh.write("kraken2_report_file: %s\n" % str(kraken2_report_file)) + self.ofh.write("kraken2_version: %s\n" % str(kraken2_version)) self.ofh.write("mutation_regions_bed_file: %s\n" % str(mutation_regions_bed_file)) self.ofh.write("mutation_regions_tsv_files: %s\n" % str(mutation_regions_tsv_files)) self.ofh.write("pima_css: %s\n" % str(pima_css)) @@ -56,10 +60,12 @@ self.analysis_name = analysis_name self.assembly_fasta_file = assembly_fasta_file self.assembly_name = assembly_name + self.blastn_version = blastn_version self.compute_sequence_length_file = compute_sequence_length_file self.contig_coverage_file = contig_coverage_file self.dbkey = dbkey self.dnadiff_snps_file = dnadiff_snps_file + self.dnadiff_version = dnadiff_version self.feature_bed_files = feature_bed_files self.feature_png_files = feature_png_files self.flye_assembly_info_file = flye_assembly_info_file @@ -67,6 +73,8 @@ self.gzipped = gzipped self.genome_insertions_file = genome_insertions_file self.illumina_fastq_file = illumina_fastq_file + self.kraken2_report_file = kraken2_report_file + self.kraken2_version = kraken2_version self.mutation_regions_bed_file = mutation_regions_bed_file self.mutation_regions_tsv_files = mutation_regions_tsv_files self.read_type = 'Illumina' @@ -101,6 +109,9 @@ self.snp_indel_title = 'SNPs and small indels' self.summary_title = 'Analysis of %s' % analysis_name + # Contamination + self.kraken_fracs = pandas.Series(dtype=object) + # Methods self.methods = pandas.Series(dtype='float64') self.methods[self.contamination_methods_title] = pandas.Series(dtype='float64') @@ -110,9 +121,6 @@ self.methods[self.feature_methods_title] = pandas.Series(dtype='float64') self.methods[self.plasmid_methods_title] = pandas.Series(dtype='float64') - # Contamination - self.kraken_fracs = pandas.Series(dtype=object) - # Notes self.assembly_notes = pandas.Series(dtype=object) self.alignment_notes = pandas.Series(dtype=object) @@ -410,12 +418,22 @@ def add_contamination(self): self.ofh.write("\nXXXXXX In add_contamination\n\n") - if len(self.kraken_fracs) == 0: + if self.kraken2_report_file is None: return + # Read in the Kraken fractions and pull out the useful parts + self.kraken_fracs = pandas.read_csv(self.kraken_report_file, delimiter='\t', header=None) + self.kraken_fracs.index = self.kraken_fracs.iloc[:, 4].values + self.kraken_fracs = self.kraken_fracs.loc[self.kraken_fracs.iloc[:, 3].str.match('[UG]1?'), :] + self.kraken_fracs = self.kraken_fracs.loc[(self.kraken_fracs.iloc[:, 0] >= 1) | (self.kraken_fracs.iloc[:, 3] == 'U'), :] + self.kraken_fracs = self.kraken_fracs.iloc[:, [0, 1, 3, 5]] + self.kraken_fracs.columns = ['Fraction', 'Reads', 'Level', 'Taxa'] + self.kraken_fracs['Fraction'] = (self.kraken_fracs['Fraction'] / 100).round(4) + self.kraken_fracs.sort_values(by='Fraction', inplace=True, ascending=False) + self.kraken_fracs['Taxa'] = self.kraken_fracs['Taxa'].str.lstrip() self.doc.new_line() self.doc.new_header(2, 'Contamination check') - for read_type, kraken_fracs in self.kraken_fracs.iteritems(): - self.doc.new_line(read_type + ' classifications') + for kraken_fracs in self.kraken_fracs.iteritems(): + self.doc.new_line(self.read_type + ' classifications') self.doc.new_line() Table_List = ["Percent of Reads", "Reads", "Level", "Label"] for index, row in kraken_fracs.iterrows(): @@ -424,7 +442,7 @@ self.doc.new_table(columns=4, rows=row_count, text=Table_List, text_align='left') if self.contamination_methods_title not in self.methods: self.methods[self.contamination_methods_title] = '' - method = 'Kraken2 was used to assign the raw reads into taxa.' + method = 'Kraken2 version %s was used to assign the raw reads into taxa.' % self.kraken2_version self.methods[self.contamination_methods_title] = self.methods[self.contamination_methods_title].append(pandas.Series(method)) def add_alignment(self): @@ -461,7 +479,7 @@ self.doc.new_line(self.doc.new_inline_image(text='contig_title', path=os.path.abspath(image_png))) self.doc.new_line('<div style="page-break-after: always;"></div>') self.doc.new_line() - method = 'The genome assembly was aligned against the reference sequencing using dnadiff.' + method = 'The genome assembly was aligned against the reference sequencing using dnadiff version %s.' % self.dnadiff_version self.methods[self.reference_methods_title] = self.methods[self.reference_methods_title].append(pandas.Series(method)) def add_features(self): @@ -500,18 +518,16 @@ feature = contig_features.iloc[i, :].copy(deep=True) self.ofh.write("feature: %s\n" % str(feature)) feature[4] = '{:.3f}'.format(feature[4]) - # FIXME: Uncommenting the following line (which is - # https://github.com/appliedbinf/pima_md/blob/main/MarkdownReport.py#L317) - # will cause the job to fail with this exception: - # ValueError: columns * rows is not equal to text length - # Table_List = Table_List + feature[1:].values.tolist() + self.ofh.write("feature[1:].values.tolist(): %s\n" % str(feature[1:].values.tolist())) + Table_List = Table_List + feature[1:].values.tolist() self.ofh.write("Table_List: %s\n" % str(Table_List)) row_count = int(len(Table_List) / 5) self.ofh.write("row_count: %s\n" % str(row_count)) self.doc.new_line() self.ofh.write("Before new_table, len(Table_List):: %s\n" % str(len(Table_List))) self.doc.new_table(columns=5, rows=row_count, text=Table_List, text_align='left') - blastn_version = 'The genome assembly was queried for features using blastn.' + if self.blastn_version is not None: + blastn_version = 'The genome assembly was queried for features using blastn version %s.' % self.blastn_version bedtools_version = 'Feature hits were clustered using bedtools and the highest scoring hit for each cluster was reported.' method = '%s %s' % (blastn_version, bedtools_version) self.methods[self.feature_methods_title] = self.methods[self.feature_methods_title].append(pandas.Series(method)) @@ -581,22 +597,23 @@ region_mutations = pandas.concat([region_mutations, region_mutation_types, region_mutation_drugs, region_notes], axis=1) region_mutations = region_mutations[['#CHROM', 'POS', 'TYPE', 'REF', 'ALT', 'DRUG', 'NOTE']] amr_mutations[region['name']] = region_mutations - # Report the mutations. - self.doc.new_line() - self.doc.new_header(level=2, title=self.mutation_title) - for region_name in amr_mutations.index.tolist(): - region_mutations = amr_mutations[region_name].copy() + if (amr_mutations.shape[0] > 0): + # Report the mutations. self.doc.new_line() - self.doc.new_header(level=3, title=region_name) - if (region_mutations.shape[0] == 0): - self.doc.append('None') - continue - region_mutations.iloc[:, 1] = region_mutations.iloc[:, 1].apply(lambda x: '{:,}'.format(x)) - Table_List = ['Reference contig', 'Position', 'Reference', 'Alternate', 'Drug', 'Note'] - for i in range(region_mutations.shape[0]): - Table_List = Table_List + region_mutations.iloc[i, [0, 1, 3, 4, 5, 6]].values.tolist() - row_count = int(len(Table_List) / 6) - self.doc.new_table(columns=6, rows=row_count, text=Table_List, text_align='left') + self.doc.new_header(level=2, title=self.mutation_title) + for region_name in amr_mutations.index.tolist(): + region_mutations = amr_mutations[region_name].copy() + self.doc.new_line() + self.doc.new_header(level=3, title=region_name) + if (region_mutations.shape[0] == 0): + self.doc.append('None') + continue + region_mutations.iloc[:, 1] = region_mutations.iloc[:, 1].apply(lambda x: '{:,}'.format(x)) + Table_List = ['Reference contig', 'Position', 'Reference', 'Alternate', 'Drug', 'Note'] + for i in range(region_mutations.shape[0]): + Table_List = Table_List + region_mutations.iloc[i, [0, 1, 3, 4, 5, 6]].values.tolist() + row_count = int(len(Table_List) / 6) + self.doc.new_table(columns=6, rows=row_count, text=Table_List, text_align='left') method = '%s reads were mapped to the reference sequence using minimap2.' % self.read_type self.methods[self.mutation_methods_title] = self.methods[self.mutation_methods_title].append(pandas.Series(method)) method = 'Mutations were identified using samtools mpileup and varscan.' @@ -690,7 +707,7 @@ self.doc.new_table(columns=6, rows=row_count, text=Table_List, text_align='left') method = 'The plasmid reference database was queried against the genome assembly using minimap2.' self.methods[self.plasmid_methods_title] = self.methods[self.plasmid_methods_title].append(pandas.Series(method)) - method = 'The resulting SAM was converted to a PSL using a custom version of sam2psl.' + method = 'The resulting BAM was converted to a PSL using a custom version of sam2psl.' self.methods[self.plasmid_methods_title] = self.methods[self.plasmid_methods_title].append(pandas.Series(method)) method = 'Plasmid-to-genome hits were resolved using the pChunks algorithm.' self.methods[self.plasmid_methods_title] = self.methods[self.plasmid_methods_title].append(pandas.Series(method)) @@ -766,7 +783,6 @@ self.add_mutations() self.add_large_indels() self.add_plasmids() - # TODO stuff working to here... self.add_amr_matrix() # self.add_snps() self.add_methods() @@ -790,10 +806,12 @@ parser.add_argument('--analysis_name', action='store', dest='analysis_name', help='Sample identifier') parser.add_argument('--assembly_fasta_file', action='store', dest='assembly_fasta_file', help='Assembly fasta file') parser.add_argument('--assembly_name', action='store', dest='assembly_name', help='Assembly identifier') +parser.add_argument('--blastn_version', action='store', dest='blastn_version', default=None, help='Blastn version string') parser.add_argument('--compute_sequence_length_file', action='store', dest='compute_sequence_length_file', help='Comnpute sequence length tabular file') parser.add_argument('--contig_coverage_file', action='store', dest='contig_coverage_file', help='Contig coverage TSV file') parser.add_argument('--dbkey', action='store', dest='dbkey', help='Reference genome identifier') parser.add_argument('--dnadiff_snps_file', action='store', dest='dnadiff_snps_file', help='DNAdiff snps tabular file') +parser.add_argument('--dnadiff_version', action='store', dest='dnadiff_version', help='DNAdiff version string') parser.add_argument('--feature_bed_dir', action='store', dest='feature_bed_dir', help='Directory of best feature hits bed files') parser.add_argument('--feature_png_dir', action='store', dest='feature_png_dir', help='Directory of best feature hits png files') parser.add_argument('--flye_assembly_info_file', action='store', dest='flye_assembly_info_file', default=None, help='Flye assembly info tabular file') @@ -801,6 +819,8 @@ parser.add_argument('--genome_insertions_file', action='store', dest='genome_insertions_file', help='Genome insertions BED file') parser.add_argument('--gzipped', action='store_true', dest='gzipped', default=False, help='Input sample is gzipped') parser.add_argument('--illumina_fastq_file', action='store', dest='illumina_fastq_file', help='Input sample') +parser.add_argument('--kraken2_report_file', action='store', dest='kraken2_report_file', default=None, help='kraken2 report file') +parser.add_argument('--kraken2_version', action='store', dest='kraken2_version', default=None, help='kraken2 version string') parser.add_argument('--mutation_regions_bed_file', action='store', dest='mutation_regions_bed_file', help='AMR mutation regions BRD file') parser.add_argument('--mutation_regions_dir', action='store', dest='mutation_regions_dir', help='Directory of mutation regions TSV files') parser.add_argument('--pima_css', action='store', dest='pima_css', help='PIMA css stypesheet') @@ -836,10 +856,12 @@ amr_matrix_files, args.assembly_fasta_file, args.assembly_name, + args.blastn_version, args.compute_sequence_length_file, args.contig_coverage_file, args.dbkey, args.dnadiff_snps_file, + args.dnadiff_version, feature_bed_files, feature_png_files, args.flye_assembly_info_file, @@ -847,6 +869,8 @@ args.genome_insertions_file, args.gzipped, args.illumina_fastq_file, + args.kraken2_report_file, + args.kraken2_version, args.mutation_regions_bed_file, mutation_regions_files, args.pima_css,
--- a/pima_report.xml Tue Mar 07 16:05:14 2023 +0000 +++ b/pima_report.xml Thu Mar 09 16:26:29 2023 +0000 @@ -9,10 +9,20 @@ #set analysis_name = re.sub('[^\s\w\-]', '_', str($illumina_fastq_file.element_identifier)) #set assembly_name = re.sub('[^\s\w\-]', '_', str($assembly_fasta_file.element_identifier)) + +#if str($blastn_file) not in ['None', '']: + #set blastn_version = re.sub('[^\s\w\-]', '_', str($blastn_file.element_identifier)) +#end if +#set dnadiff_version = re.sub('[^\s\w\-]', '_', str($dnadiff_snps_file.element_identifier)) +#set dnadiff_version = $dnadiff_version.rstrip('(snps)') #if str($flye_assembly_info_file) not in ['None', '']: #set flye_version = re.sub('[^\s\w\-]', '_', str($flye_assembly_info_file.element_identifier)) #set flye_version = $flye_version.rstrip('(assembly info)') #end if +#if str($kraken2_report_file) not in ['None', '']: + #set kraken2_version = re.sub('[^\s\w\-]', '_', str($kraken2_report_file.element_identifier)) + #set kraken2_version = $kraken2_version.rstrip('(report)') +#end if mkdir amr_matrix_png_dir && mkdir feature_bed_dir && @@ -47,10 +57,14 @@ --analysis_name '$analysis_name' --assembly_fasta_file '$assembly_fasta_file' --assembly_name '$assembly_name' +#if str($blastn_file) not in ['None', '']: + --blastn_version '$blastn_version' +#end if --compute_sequence_length_file '$compute_sequence_length_file' --contig_coverage_file '$contig_coverage_file' --dbkey '$aligned_sample.metadata.dbkey' --dnadiff_snps_file '$dnadiff_snps_file' +--dnadiff_version '$dnadiff_version' --feature_bed_dir 'feature_bed_dir' --feature_png_dir 'feature_png_dir' #if str($flye_assembly_info_file) not in ['None', '']: @@ -62,6 +76,10 @@ --gzipped #end if --illumina_fastq_file '$illumina_fastq_file' +#if str($kraken2_report_file) not in ['None', '']: + --kraken2_report_file '$kraken2_report_file' + --kraken2_version '$kraken2_version' +#end if --mutation_regions_dir 'mutation_regions_dir' --mutation_regions_bed_file '$mutation_regions_bed_file' --pima_css '${__tool_directory__}/pima.css' @@ -74,6 +92,7 @@ <param name="aligned_sample" type="data" format="bam" label="Aligned sample BAM file"/> <param name="amr_deletions_file" type="data" format="bed" label="AMR deletions BED file"/> <param name="assembly_fasta_file" type="data" format="fasta" label="Assembly FASTA file"/> + <param name="blastn_file" type="data" format="bam" label="Blastn BAM file"/> <param name="compute_sequence_length_file" type="data" format="tabular,tsv" label="Compute sequence length tabular file"/> <param name="contig_coverage_file" type="data" format="tabular,tsv" label="Contig coverage tabular file"/> <param name="dnadiff_snps_file" type="data" format="tabular" label="DNAdiff snps tabular file"/> @@ -81,6 +100,7 @@ <param name="features_png" format="png" type="data_collection" collection_type="list" label="Collection of best feature hits PNG files"/> <param name="flye_assembly_info_file" type="data" format="tabular,tsv" optional="true" label="Flye assembly info tabular file" help="Optional, ignored if not selected"/> <param name="genome_insertions_file" type="data" format="bed" label="Genome insertions BED file"/> + <param name="kraken2_report_file" type="data" format="tabular,tsv" optional="true" label="Kraken2 report tabular file" help="Optional, ignored if not selected"/> <param name="illumina_fastq_file" type="data" format="fastqsanger,fastqsanger.gz" label="Fastq sample file"/> <param name="mutation_regions" format="tabular,tsv" type="data_collection" collection_type="list" label="Collection of mutation regions tabular files"/> <param name="mutation_regions_bed_file" type="data" format="mutations_regions,bed" label="Mutation regions BED file"/>