Mercurial > repos > greg > pima_report
changeset 21:667b253329c6 draft
Uploaded
author | greg |
---|---|
date | Thu, 13 Apr 2023 17:13:40 +0000 |
parents | 4fe8c35cd176 |
children | 13a9c8ecd30e |
files | pima_report.py pima_report.xml |
diffstat | 2 files changed, 77 insertions(+), 54 deletions(-) [+] |
line wrap: on
line diff
--- a/pima_report.py Thu Mar 30 20:41:18 2023 +0000 +++ b/pima_report.py Thu Apr 13 17:13:40 2023 +0000 @@ -17,20 +17,20 @@ class PimaReport: - def __init__(self, analysis_name=None, amr_deletions_file=None, amr_matrix_files=None, assembly_fasta_file=None, - assembly_name=None, bedtools_version=None, blastn_version=None, circos_files=None, - compute_sequence_length_file=None, contig_coverage_file=None, dbkey=None, dnadiff_snps_file=None, - dnadiff_version=None, errors_file=None, feature_bed_files=None, feature_png_files=None, - flye_assembly_info_file=None, flye_version=None, genome_insertions_file=None, gzipped=None, + def __init__(self, analysis_name=None, amr_deletions_file=None, amr_matrix_files=None, assembler_version=None, + assembly_fasta_file=None, assembly_name=None, bedtools_version=None, blastn_version=None, + circos_files=None, compute_sequence_length_file=None, contig_coverage_file=None, dbkey=None, + dnadiff_snps_file=None, dnadiff_version=None, errors_file=None, fastq_file=None, feature_bed_files=None, + feature_png_files=None, flye_assembly_info_file=None, genome_insertions_file=None, gzipped=None, kraken2_report_file=None, kraken2_version=None, minimap2_version=None, mutation_regions_bed_file=None, - mutation_regions_tsv_files=None, ont_fastq_file=None, pima_css=None, plasmids_file=None, - quast_report_file=None, read_type=None, reference_insertions_file=None, samtools_version=None, - varscan_version=None): + mutation_regions_tsv_files=None, pima_css=None, plasmids_file=None, quast_report_file=None, + read_type=None, reference_insertions_file=None, samtools_version=None, varscan_version=None): self.ofh = open("process_log.txt", "w") self.ofh.write("amr_deletions_file: %s\n" % str(amr_deletions_file)) self.ofh.write("amr_matrix_files: %s\n" % str(amr_matrix_files)) self.ofh.write("analysis_name: %s\n" % str(analysis_name)) + self.ofh.write("assembler_version: %s\n" % str(assembler_version)) self.ofh.write("assembly_fasta_file: %s\n" % str(assembly_fasta_file)) self.ofh.write("assembly_name: %s\n" % str(assembly_name)) self.ofh.write("bedtools_version: %s\n" % str(bedtools_version)) @@ -42,10 +42,10 @@ self.ofh.write("dnadiff_snps_file: %s\n" % str(dnadiff_snps_file)) self.ofh.write("dnadiff_version: %s\n" % str(dnadiff_version)) self.ofh.write("errors_file: %s\n" % str(errors_file)) + self.ofh.write("fastq_file: %s\n" % str(fastq_file)) self.ofh.write("feature_bed_files: %s\n" % str(feature_bed_files)) self.ofh.write("feature_png_files: %s\n" % str(feature_png_files)) self.ofh.write("flye_assembly_info_file: %s\n" % str(flye_assembly_info_file)) - self.ofh.write("flye_version: %s\n" % str(flye_version)) self.ofh.write("gzipped: %s\n" % str(gzipped)) self.ofh.write("genome_insertions_file: %s\n" % str(genome_insertions_file)) self.ofh.write("kraken2_report_file: %s\n" % str(kraken2_report_file)) @@ -53,7 +53,6 @@ self.ofh.write("minimap2_version: %s\n" % str(minimap2_version)) self.ofh.write("mutation_regions_bed_file: %s\n" % str(mutation_regions_bed_file)) self.ofh.write("mutation_regions_tsv_files: %s\n" % str(mutation_regions_tsv_files)) - self.ofh.write("ont_fastq_file: %s\n" % str(ont_fastq_file)) self.ofh.write("pima_css: %s\n" % str(pima_css)) self.ofh.write("plasmids_file: %s\n" % str(plasmids_file)) self.ofh.write("quast_report_file: %s\n" % str(quast_report_file)) @@ -71,6 +70,16 @@ self.amr_matrix_files = amr_matrix_files self.analysis_name = analysis_name.split('_')[0] self.ofh.write("self.analysis_name: %s\n" % str(self.analysis_name)) + if assembler_version is None: + self.assembler_version = 'assembler (version unknown)' + else: + if read_type == 'ont': + # Assembler is flye. + assembler_version = assembler_version.rstrip(' _assembly info_') + else: + # Assembler is spades. + assembler_version = assembler_version.rstrip(' _contigs') + self.assembler_version = re.sub('_', '.', assembler_version) self.assembly_fasta_file = assembly_fasta_file self.assembly_name = re.sub('_', '.', assembly_name.rstrip(' _consensus_')) if bedtools_version is None: @@ -94,10 +103,6 @@ self.feature_bed_files = feature_bed_files self.feature_png_files = feature_png_files self.flye_assembly_info_file = flye_assembly_info_file - if flye_version is None: - self.flye_version = 'flye (version unknown)' - else: - self.flye_version = re.sub('_', '.', flye_version.rstrip(' _assembly info_')) self.gzipped = gzipped self.genome_insertions_file = genome_insertions_file self.kraken2_report_file = kraken2_report_file @@ -146,6 +151,7 @@ self.mutation_methods_title = 'Mutation screening' self.plasmid_methods_title = 'Plasmid annotation' self.plasmid_title = 'Plasmid annotation' + self.reference_genome_title = 'Reference genome' self.reference_methods_title = 'Reference comparison' self.snp_indel_title = 'SNPs and small indels' self.summary_title = 'Summary' @@ -154,6 +160,7 @@ self.methods = pandas.Series(dtype='float64') self.methods[self.contamination_methods_title] = pandas.Series(dtype='float64') self.methods[self.assembly_methods_title] = pandas.Series(dtype='float64') + self.methods[self.reference_genome_title] = pandas.Series(dtype='float64') self.methods[self.reference_methods_title] = pandas.Series(dtype='float64') self.methods[self.mutation_methods_title] = pandas.Series(dtype='float64') self.methods[self.feature_methods_title] = pandas.Series(dtype='float64') @@ -169,7 +176,6 @@ self.contig_info = None self.did_medaka_ont_assembly = False self.feature_hits = pandas.Series(dtype='float64') - self.illumina_fastq_file = None self.illumina_length_mean = None self.illumina_read_count = None self.illumina_bases = None @@ -177,18 +183,26 @@ # TODO: should the following be passed as a parameter? self.ont_coverage_min = 30 self.ont_fast5 = None - self.ont_fastq_file = ont_fastq_file + self.fastq_file = fastq_file self.ont_n50 = None # TODO: should the following be passed as a parameter? self.ont_n50_min = 2500 - self.ont_raw_fastq = self.analysis_name + if self.read_type == 'ONT': + self.ont_raw_fastq = self.analysis_name + self.illumina_fastq = None + else: + self.ont_raw_fastq = None + self.illumina_fastq = self.analysis_name self.ont_read_count = None # Actions self.did_guppy_ont_fast5 = False self.did_qcat_ont_fastq = False - self.info_ont_fastq(self.ont_fastq_file) - self.info_illumina_fastq() + self.ofh.write("self.read_type: %s\n" % str(self.read_type)) + if self.read_type == 'ONT': + self.info_ont_fastq(self.fastq_file) + else: + self.info_illumina_fastq(self.fastq_file) self.load_contig_info() def run_command(self, command): @@ -247,7 +261,7 @@ self.assembly_size += len(i.seq) self.assembly_size = self.format_kmg(self.assembly_size, decimals=1) - def info_illumina_fastq(self): + def info_illumina_fastq(self, fastq_file): self.ofh.write("\nXXXXXX In info_illumina_fastq\n\n") if self.illumina_length_mean is None: return @@ -256,7 +270,7 @@ else: opener = 'cat' command = ' '.join([opener, - self.ont_fastq_file, + fastq_file, '| awk \'{getline;s += length($1);getline;getline;}END{print s/(NR/4)"\t"(NR/4)"\t"s}\'']) output = self.run_command(command) self.ofh.write("output:\n%s\n" % str(output)) @@ -274,9 +288,6 @@ self.illumina_read_count += int(values[1]) self.ofh.write("values[2]:\n%s\n" % str(values[2])) self.illumina_bases += int(values[2]) - # The original PIMA code inserts self.illumina_fastq_file into - # a list for no apparent reason. We don't do that here. - # self.illumina_length_mean /= len(self.illumina_fastq_file) self.illumina_length_mean /= 1 self.illumina_bases = self.format_kmg(self.illumina_bases, decimals=1) @@ -305,7 +316,7 @@ "ONT FASTQ", self.wordwrap_markdown(self.ont_raw_fastq), "Illumina FASTQ", - "N/A", + self.wordwrap_markdown(self.illumina_fastq), "Assembly", self.wordwrap_markdown(self.assembly_name), "Reference", @@ -407,7 +418,8 @@ 'END{printf "%d\\t%d\\t%d\\n", n50, (NR - 1), s;exit}\'']) result = list(re.split('\\t', self.run_command(command)[0])) if result[1] == '0': - self.error_out('No ONT reads found') + warning = 'No ONT reads found' + self.assembly_notes = self.assembly_notes.append(pandas.Series(warning)) self.ont_n50, self.ont_read_count, ont_raw_bases = [int(i) for i in result] command = ' '.join([opener, fastq_file, @@ -530,11 +542,16 @@ contig_title = 'Alignment to %s' % contig self.doc.new_line() self.doc.new_header(level=3, title=contig_title) - self.doc.new_line('Blue indicates aligned sequences (to the reference) and yellow indicates missing sequences') self.doc.new_line(self.doc.new_inline_image(text='contig_title', path=os.path.abspath(circos_file))) self.doc.new_line('<div style="page-break-after: always;"></div>') self.doc.new_line() - method = 'The genome assembly was aligned against the reference sequencing using dnadiff version %s.' % self.dnadiff_version + if self.dbkey == 'ref_genome': + headers = ["* Chromosome - NC_007530.2 Bacillus anthracis str. 'Ames Ancestor', complete sequence", + "* pXO1 - NC_007322.2 Bacillus anthracis str. 'Ames Ancestor' plasmid pXO1, complete sequence", + "* pXO2 - NC_007323.3 Bacillus anthracis str. 'Ames Ancestor' plasmid pXO2, complete sequence"] + method = '\n'.join(headers) + self.methods[self.reference_genome_title] = self.methods[self.reference_genome_title].append(pandas.Series(method)) + method = 'The genome assembly was aligned against the reference sequence using %s.' % self.dnadiff_version self.methods[self.reference_methods_title] = self.methods[self.reference_methods_title].append(pandas.Series(method)) def add_features(self): @@ -780,17 +797,20 @@ self.add_contig_info() self.evaluate_assembly() self.add_assembly_information() - if self.flye_assembly_info_file is not None: - method = 'ONT reads were assembled using %s' % self.flye_version.rstrip('assembly info') - self.methods[self.assembly_methods_title] = self.methods[self.assembly_methods_title].append(pandas.Series(method)) - # Pull in the assembly summary and look at the coverage. - assembly_info = pandas.read_csv(self.flye_assembly_info_file, header=0, index_col=0, sep='\t') - # Look for non-circular contigs. - open_contigs = assembly_info.loc[assembly_info['circ.'] == 'N', :] - if open_contigs.shape[0] > 0: - open_contig_ids = open_contigs.index.values - warning = 'Flye reported {:d} open contigs ({:s}); assembly may be incomplete.'.format(open_contigs.shape[0], ', '.join(open_contig_ids)) - self.assembly_notes = self.assembly_notes.append(pandas.Series(warning)) + if self.assembler_version is not None: + if self.read_type == 'ONT': + method = 'ONT reads were assembled using %s' % self.assembler_version + self.methods[self.assembly_methods_title] = self.methods[self.assembly_methods_title].append(pandas.Series(method)) + # Pull in the assembly summary and look at the coverage. + assembly_info = pandas.read_csv(self.flye_assembly_info_file, header=0, index_col=0, sep='\t') + # Look for non-circular contigs. + open_contigs = assembly_info.loc[assembly_info['circ.'] == 'N', :] + if open_contigs.shape[0] > 0: + open_contig_ids = open_contigs.index.values + warning = 'Flye reported {:d} open contigs ({:s}); assembly may be incomplete.'.format(open_contigs.shape[0], ', '.join(open_contig_ids)) + self.assembly_notes = self.assembly_notes.append(pandas.Series(warning)) + else: + method = 'Illumina reads were assembled using %s' % self.assembler_version if self.did_medaka_ont_assembly: method = 'the genome assembly was polished using ont reads and medaka.' self.methods[self.assembly_methods_title] = self.methods[self.assembly_methods_title].append(pandas.series(method)) @@ -836,6 +856,7 @@ parser.add_argument('--amr_deletions_file', action='store', dest='amr_deletions_file', help='AMR deletions BED file') parser.add_argument('--amr_matrix_png_dir', action='store', dest='amr_matrix_png_dir', help='Directory of AMR matrix PNG files') parser.add_argument('--analysis_name', action='store', dest='analysis_name', help='Sample identifier') +parser.add_argument('--assembler_version', action='store', dest='assembler_version', default=None, help='Assembler version string') parser.add_argument('--assembly_fasta_file', action='store', dest='assembly_fasta_file', help='Assembly fasta file') parser.add_argument('--assembly_name', action='store', dest='assembly_name', help='Assembly identifier') parser.add_argument('--bedtools_version', action='store', dest='bedtools_version', default=None, help='Bedtools version string') @@ -847,13 +868,12 @@ parser.add_argument('--dnadiff_snps_file', action='store', dest='dnadiff_snps_file', help='DNAdiff snps tabular file') parser.add_argument('--dnadiff_version', action='store', dest='dnadiff_version', default=None, help='DNAdiff version string') parser.add_argument('--errors_file', action='store', dest='errors_file', default=None, help='AMR mutations errors encountered txt file') +parser.add_argument('--fastq_file', action='store', dest='fastq_file', help='Input sample') parser.add_argument('--feature_bed_dir', action='store', dest='feature_bed_dir', help='Directory of best feature hits bed files') parser.add_argument('--feature_png_dir', action='store', dest='feature_png_dir', help='Directory of best feature hits png files') parser.add_argument('--flye_assembly_info_file', action='store', dest='flye_assembly_info_file', default=None, help='Flye assembly info tabular file') -parser.add_argument('--flye_version', action='store', dest='flye_version', default=None, help='Flye version string') parser.add_argument('--genome_insertions_file', action='store', dest='genome_insertions_file', help='Genome insertions BED file') parser.add_argument('--gzipped', action='store_true', dest='gzipped', default=False, help='Input sample is gzipped') -parser.add_argument('--ont_fastq_file', action='store', dest='ont_fastq_file', help='Input sample') parser.add_argument('--kraken2_report_file', action='store', dest='kraken2_report_file', default=None, help='kraken2 report file') parser.add_argument('--kraken2_version', action='store', dest='kraken2_version', default=None, help='kraken2 version string') parser.add_argument('--minimap2_version', action='store', dest='minimap2_version', default=None, help='minimap2 version string') @@ -898,6 +918,7 @@ markdown_report = PimaReport(args.analysis_name, args.amr_deletions_file, amr_matrix_files, + args.assembler_version, args.assembly_fasta_file, args.assembly_name, args.bedtools_version, @@ -909,10 +930,10 @@ args.dnadiff_snps_file, args.dnadiff_version, args.errors_file, + args.fastq_file, feature_bed_files, feature_png_files, args.flye_assembly_info_file, - args.flye_version, args.genome_insertions_file, args.gzipped, args.kraken2_report_file, @@ -920,7 +941,6 @@ args.minimap2_version, args.mutation_regions_bed_file, mutation_regions_files, - args.ont_fastq_file, args.pima_css, args.plasmids_file, args.quast_report_file,
--- a/pima_report.xml Thu Mar 30 20:41:18 2023 +0000 +++ b/pima_report.xml Thu Apr 13 17:13:40 2023 +0000 @@ -7,7 +7,7 @@ <command detect_errors="exit_code"><![CDATA[ #import re -#set analysis_name = re.sub('[^\s\w\-]', '_', str($ont_fastq_file.element_identifier)) +#set analysis_name = re.sub('[^\s\w\-]', '_', str($fastq_file.element_identifier)) #set assembly_name = re.sub('[^\s\w\-]', '_', str($assembly_fasta_file.element_identifier)) #if str($bedtools_complementbed_file) not in ['None', '']: @@ -19,8 +19,8 @@ #if str($dnadiff_snps_file) not in ['None', '']: #set dnadiff_version = re.sub('[^\s\w\-]', '_', str($dnadiff_snps_file.element_identifier)) #end if -#if str($flye_assembly_info_file) not in ['None', '']: - #set flye_version = re.sub('[^\s\w\-]', '_', str($flye_assembly_info_file.element_identifier)) +#if str($assembler_version_file) not in ['None', '']: + #set assembler_version = re.sub('[^\s\w\-]', '_', str($assembler_version_file.element_identifier)) #end if #if str($kraken2_report_file) not in ['None', '']: #set kraken2_version = re.sub('[^\s\w\-]', '_', str($kraken2_report_file.element_identifier)) @@ -91,15 +91,18 @@ --errors_file '$errors_file' --feature_bed_dir 'feature_bed_dir' --feature_png_dir 'feature_png_dir' -#if str($flye_assembly_info_file) not in ['None', '']: - --flye_assembly_info_file '$flye_assembly_info_file' - --flye_version '$flye_version' +#if str($assembler_version_file) not in ['None', '']: + --assembler_version '$assembler_version' + #if str($read_type) == 'ont': + ## Need to pass the tabular flye assembly file. + --flye_assembly_info_file '$assembler_version_file' + #end if #end if --genome_insertions_file '$genome_insertions_file' -#if $ont_fastq_file.ext.endswith(".gz"): +#if $fastq_file.ext.endswith(".gz"): --gzipped #end if ---ont_fastq_file '$ont_fastq_file' +--fastq_file '$fastq_file' #if str($kraken2_report_file) not in ['None', '']: --kraken2_report_file '$kraken2_report_file' --kraken2_version '$kraken2_version' @@ -134,12 +137,12 @@ <param name="contig_coverage_file" type="data" format="tabular,tsv" label="Contig coverage tabular file"/> <param name="dnadiff_snps_file" type="data" format="tabular" label="DNAdiff snps tabular file"/> <param name="errors_file" type="data" format="txt" label="AMR mutation regions error txt file"/> + <param name="fastq_file" type="data" format="fastqsanger,fastqsanger.gz" label="Fastq sample file"/> <param name="features_bed" format="bed" type="data_collection" collection_type="list" label="Collection of best feature hits BED files"/> <param name="features_png" format="png" type="data_collection" collection_type="list" label="Collection of best feature hits PNG files"/> - <param name="flye_assembly_info_file" type="data" format="tabular,tsv" optional="true" label="Flye assembly info tabular file" help="Optional, ignored if not selected"/> + <param name="assembler_version_file" type="data" format="fasta,tabular,tsv" optional="true" label="Assembly version file" help="Optional, ignored if not selected"/> <param name="genome_insertions_file" type="data" format="bed" label="Genome insertions BED file"/> <param name="kraken2_report_file" type="data" format="tabular,tsv" optional="true" label="Kraken2 report tabular file" help="Optional, ignored if not selected"/> - <param name="ont_fastq_file" type="data" format="fastqsanger,fastqsanger.gz" label="ONT fastq sample file"/> <param name="minimap2_bam_file" type="data" format="bam" label="Minimap2 BAM file"/> <param name="mutation_regions" format="tabular,tsv" type="data_collection" collection_type="list" label="Collection of mutation regions tabular files"/> <param name="mutation_regions_bed_file" type="data" format="mutations_regions,bed" label="Mutation regions BED file"/> @@ -161,7 +164,7 @@ <param name="aligned_sample" value="aligned_sample.bam" ftype="bam"/> <param name="assembly_fasta_file" value="assembly_fasta.fasta" ftype="fasta"/> <param name="contig_coverage_file" value="contig_coverage.tabular" ftype="tabular"/> - <param name="ont_fastq_file" value="ont_fastq.fastq" ftype="fastq"/> + <param name="fastq_file" value="ont_fastq.fastq" ftype="fastq"/> <output name="output" value="output.pdf" ftype="pdf"/> </test> </tests>