Mercurial > repos > greg > pima_report
changeset 26:46edd7435555 draft
Uploaded
author | greg |
---|---|
date | Tue, 25 Apr 2023 20:31:35 +0000 (20 months ago) |
parents | 4986a7fb2145 |
children | ddc056cf16bf |
files | pima_report.py pima_report.xml |
diffstat | 2 files changed, 120 insertions(+), 85 deletions(-) [+] |
line wrap: on
line diff
--- a/pima_report.py Thu Apr 20 18:47:01 2023 +0000 +++ b/pima_report.py Tue Apr 25 20:31:35 2023 +0000 @@ -20,10 +20,11 @@ def __init__(self, analysis_name=None, amr_deletions_file=None, amr_matrix_files=None, assembler_version=None, assembly_fasta_file=None, assembly_name=None, bedtools_version=None, blastn_version=None, circos_files=None, compute_sequence_length_file=None, contig_coverage_file=None, dbkey=None, - dnadiff_snps_file=None, dnadiff_version=None, errors_file=None, fastq_file=None, feature_bed_files=None, + dnadiff_snps_file=None, dnadiff_version=None, errors_file=None, feature_bed_files=None, feature_png_files=None, flye_assembly_info_file=None, genome_insertions_file=None, gzipped=None, - kraken2_report_file=None, kraken2_version=None, minimap2_version=None, mutation_regions_bed_file=None, - mutation_regions_tsv_files=None, pima_css=None, plasmids_file=None, quast_report_file=None, + illumina_forward_read_file=None, illumina_reverse_read_file=None, kraken2_report_file=None, + kraken2_version=None, minimap2_version=None, mutation_regions_bed_file=None, + mutation_regions_tsv_files=None, ont_file=None, pima_css=None, plasmids_file=None, quast_report_file=None, read_type=None, reference_insertions_file=None, samtools_version=None, varscan_version=None): self.ofh = open("process_log.txt", "w") @@ -42,17 +43,19 @@ self.ofh.write("dnadiff_snps_file: %s\n" % str(dnadiff_snps_file)) self.ofh.write("dnadiff_version: %s\n" % str(dnadiff_version)) self.ofh.write("errors_file: %s\n" % str(errors_file)) - self.ofh.write("fastq_file: %s\n" % str(fastq_file)) self.ofh.write("feature_bed_files: %s\n" % str(feature_bed_files)) self.ofh.write("feature_png_files: %s\n" % str(feature_png_files)) self.ofh.write("flye_assembly_info_file: %s\n" % str(flye_assembly_info_file)) self.ofh.write("gzipped: %s\n" % str(gzipped)) self.ofh.write("genome_insertions_file: %s\n" % str(genome_insertions_file)) + self.ofh.write("illumina_forward_read_file: %s\n" % str(illumina_forward_read_file)) + self.ofh.write("illumina_reverse_read_file: %s\n" % str(illumina_reverse_read_file)) self.ofh.write("kraken2_report_file: %s\n" % str(kraken2_report_file)) self.ofh.write("kraken2_version: %s\n" % str(kraken2_version)) self.ofh.write("minimap2_version: %s\n" % str(minimap2_version)) self.ofh.write("mutation_regions_bed_file: %s\n" % str(mutation_regions_bed_file)) self.ofh.write("mutation_regions_tsv_files: %s\n" % str(mutation_regions_tsv_files)) + self.ofh.write("ont_file: %s\n" % str(ont_file)) self.ofh.write("pima_css: %s\n" % str(pima_css)) self.ofh.write("plasmids_file: %s\n" % str(plasmids_file)) self.ofh.write("quast_report_file: %s\n" % str(quast_report_file)) @@ -105,6 +108,8 @@ self.flye_assembly_info_file = flye_assembly_info_file self.gzipped = gzipped self.genome_insertions_file = genome_insertions_file + self.illumina_forward_read_file = illumina_forward_read_file + self.illumina_reverse_read_file = illumina_reverse_read_file self.kraken2_report_file = kraken2_report_file if kraken2_version is None: self.kraken2_version = 'kraken2 (version unknown)' @@ -174,35 +179,39 @@ # Values self.assembly_size = 0 self.contig_info = None - self.did_medaka_ont_assembly = False self.feature_hits = pandas.Series(dtype='float64') - self.illumina_length_mean = None - self.illumina_read_count = None - self.illumina_bases = None - self.ont_bases = None + self.ont_fast5 = None + self.ont_file = ont_file + self.ont_n50 = None + self.ont_read_count = None # TODO: should the following be passed as a parameter? self.ont_coverage_min = 30 - self.ont_fast5 = None - self.fastq_file = fastq_file - self.ont_n50 = None # TODO: should the following be passed as a parameter? self.ont_n50_min = 2500 + if self.read_type == 'ONT': self.ont_raw_fastq = self.analysis_name + self.ont_bases = 0 + self.illumina_bases = None self.illumina_fastq = None + self.illumina_length_mean = None + self.illumina_read_count = None else: + self.illumina_fastq = self.analysis_name + self.illumina_bases = 0 + self.illumina_length_mean = 0 + self.illumina_read_count = 0 + self.ont_bases = None self.ont_raw_fastq = None - self.illumina_fastq = self.analysis_name - self.ont_read_count = None # Actions self.did_guppy_ont_fast5 = False self.did_qcat_ont_fastq = False self.ofh.write("self.read_type: %s\n" % str(self.read_type)) if self.read_type == 'ONT': - self.info_ont_fastq(self.fastq_file) + self.info_ont_fastq(self.ont_file) else: - self.info_illumina_fastq(self.fastq_file) + self.info_illumina_fastq([self.illumina_forward_read_file, self.illumina_reverse_read_file]) self.load_contig_info() def run_command(self, command): @@ -230,9 +239,9 @@ self.contig_info = pandas.Series(dtype=object) self.contig_info[self.read_type] = pandas.read_csv(self.contig_coverage_file, header=None, index_col=None, sep='\t').sort_values(1, axis=0, ascending=False) self.contig_info[self.read_type].columns = ['contig', 'size', 'coverage'] - self.mean_coverage = (self.contig_info[self.read_type].iloc[:, 1] * self.contig_info[self.read_type].iloc[:, 2]).sum() / self.contig_info[self.read_type].iloc[:, 1].sum() - if self.mean_coverage <= self.ont_coverage_min: - warning = '%s mean coverage ({:.0f}X) is less than the recommended minimum ({:.0f}X).'.format(self.mean_coverage, self.ont_coverage_min) % self.read_type + mean_coverage = (self.contig_info[self.read_type].iloc[:, 1] * self.contig_info[self.read_type].iloc[:, 2]).sum() / self.contig_info[self.read_type].iloc[:, 1].sum() + if mean_coverage <= self.ont_coverage_min: + warning = '%s mean coverage ({:.0f}X) is less than the recommended minimum ({:.0f}X).'.format(mean_coverage, self.ont_coverage_min) % self.read_type self.assembly_notes = self.assembly_notes.append(pandas.Series(warning)) # Report if some contigs have low coverage. low_coverage = self.contig_info[self.read_type].loc[self.contig_info[self.read_type]['coverage'] < self.ont_coverage_min, :] @@ -240,12 +249,12 @@ for contig_i in range(low_coverage.shape[0]): warning = '%s coverage of {:s} ({:.0f}X) is less than the recommended minimum ({:.0f}X).'.format(low_coverage.iloc[contig_i, 0], low_coverage.iloc[contig_i, 2], self.ont_coverage_min) % self.read_type self.assembly_notes = self.assembly_notes.append(pandas.Series(warning)) - # See if some contigs have anolously low coverage. - fold_coverage = self.contig_info[self.read_type]['coverage'] / self.mean_coverage + # See if some contigs have anonymously low coverage. + fold_coverage = self.contig_info[self.read_type]['coverage'] / mean_coverage low_coverage = self.contig_info[self.read_type].loc[fold_coverage < 1 / 5, :] if low_coverage.shape[0] >= 0: for contig_i in range(low_coverage.shape[0]): - warning = '%s coverage of {:s} ({:.0f}X) is less than 1/5 the mean coverage ({:.0f}X).'.format(low_coverage.iloc[contig_i, 0], low_coverage.iloc[contig_i, 2], self.mean_coverage) % self.read_type + warning = '%s coverage of {:s} ({:.0f}X) is less than 1/5 the mean coverage ({:.0f}X).'.format(low_coverage.iloc[contig_i, 0], low_coverage.iloc[contig_i, 2], mean_coverage) % self.read_type self.assembly_notes = self.assembly_notes.append(pandas.Series(warning)) def load_fasta(self, fasta): @@ -257,38 +266,35 @@ def load_assembly(self): self.assembly = self.load_fasta(self.assembly_fasta_file) self.num_assembly_contigs = len(self.assembly) - for i in self.assembly: - self.assembly_size += len(i.seq) - self.assembly_size = self.format_kmg(self.assembly_size, decimals=1) + self.assembly_size = self.format_kmg(sum([len(x) for x in self.assembly]), decimals=1) - def info_illumina_fastq(self, fastq_file): + def info_illumina_fastq(self, illumina_read_files): self.ofh.write("\nXXXXXX In info_illumina_fastq\n\n") - if self.illumina_length_mean is None: - return if self.gzipped: opener = 'gunzip -c' else: opener = 'cat' - command = ' '.join([opener, - fastq_file, - '| awk \'{getline;s += length($1);getline;getline;}END{print s/(NR/4)"\t"(NR/4)"\t"s}\'']) - output = self.run_command(command) - self.ofh.write("output:\n%s\n" % str(output)) - self.ofh.write("re.split('\\t', self.run_command(command)[0]:\n%s\n" % str(re.split('\\t', self.run_command(command)[0]))) - values = [] - for i in re.split('\\t', self.run_command(command)[0]): - if i == '': - values.append(float('nan')) - else: - values.append(float(i)) - self.ofh.write("values:\n%s\n" % str(values)) - self.ofh.write("values[0]:\n%s\n" % str(values[0])) - self.illumina_length_mean += values[0] - self.ofh.write("values[1]:\n%s\n" % str(values[1])) - self.illumina_read_count += int(values[1]) - self.ofh.write("values[2]:\n%s\n" % str(values[2])) - self.illumina_bases += int(values[2]) - self.illumina_length_mean /= 1 + for fastq_file in illumina_read_files: + command = ' '.join([opener, + fastq_file, + '| awk \'{getline;s += length($1);getline;getline;}END{print s/(NR/4)"\t"(NR/4)"\t"s}\'']) + output = self.run_command(command) + self.ofh.write("output:\n%s\n" % str(output)) + self.ofh.write("re.split('\\t', self.run_command(command)[0]:\n%s\n" % str(re.split('\\t', self.run_command(command)[0]))) + values = [] + for i in re.split('\\t', self.run_command(command)[0]): + if i == '': + values.append(float('nan')) + else: + values.append(float(i)) + self.ofh.write("values:\n%s\n" % str(values)) + self.ofh.write("values[0]:\n%s\n" % str(values[0])) + self.illumina_length_mean += values[0] + self.ofh.write("values[1]:\n%s\n" % str(values[1])) + self.illumina_read_count += int(values[1]) + self.ofh.write("values[2]:\n%s\n" % str(values[2])) + self.illumina_bases += int(values[2]) + self.illumina_length_mean /= 2 self.illumina_bases = self.format_kmg(self.illumina_bases, decimals=1) def start_doc(self): @@ -454,20 +460,17 @@ def add_contig_info(self): self.ofh.write("\nXXXXXX In add_contig_info\n\n") - if self.contig_info is None: + if self.contig_info is None or self.read_type not in self.contig_info.index: return - for method in ['ONT', 'Illumina']: - if method not in self.contig_info.index: - continue - self.doc.new_line() - self.doc.new_header(2, 'Assembly coverage by ' + method) - Table_List = ["Contig", "Length (bp)", "Coverage (X)"] - formatted = self.contig_info[method].copy() - formatted.iloc[:, 1] = formatted.iloc[:, 1].apply(lambda x: '{:,}'.format(x)) - for i in range(self.contig_info[method].shape[0]): - Table_List = Table_List + formatted.iloc[i, :].values.tolist() - row_count = int(len(Table_List) / 3) - self.doc.new_table(columns=3, rows=row_count, text=Table_List, text_align='left') + self.doc.new_line() + self.doc.new_header(2, 'Assembly coverage by ' + self.read_type) + Table_List = ["Contig", "Length (bp)", "Coverage (X)"] + formatted = self.contig_info[self.read_type].copy() + formatted.iloc[:, 1] = formatted.iloc[:, 1].apply(lambda x: '{:,}'.format(x)) + for i in range(self.contig_info[self.read_type].shape[0]): + Table_List = Table_List + formatted.iloc[i, :].values.tolist() + row_count = int(len(Table_List) / 3) + self.doc.new_table(columns=3, rows=row_count, text=Table_List, text_align='left') def add_assembly_notes(self): self.ofh.write("\nXXXXXX In add_assembly_notes\n\n") @@ -794,10 +797,10 @@ if self.did_qcat_ont_fastq: methods += ['ONT reads were demultiplexed and trimmed using qcat'] self.methods[self.basecalling_methods_title] = pandas.Series(methods) - # self.add_illumina_library_information() + self.add_illumina_library_information() + self.add_assembly_information() self.add_contig_info() self.evaluate_assembly() - self.add_assembly_information() if self.assembler_version is not None: if self.read_type == 'ONT': method = 'ONT reads were assembled using %s' % self.assembler_version @@ -812,9 +815,8 @@ self.assembly_notes = self.assembly_notes.append(pandas.Series(warning)) else: method = 'Illumina reads were assembled using %s' % self.assembler_version - if self.did_medaka_ont_assembly: - method = 'the genome assembly was polished using ont reads and medaka.' - self.methods[self.assembly_methods_title] = self.methods[self.assembly_methods_title].append(pandas.series(method)) + method = 'The genome assembly was polished using ONT reads and medaka.' + self.methods[self.assembly_methods_title] = self.methods[self.assembly_methods_title].append(pandas.series(method)) self.add_assembly_notes() def make_tex(self): @@ -869,17 +871,19 @@ parser.add_argument('--dnadiff_snps_file', action='store', dest='dnadiff_snps_file', help='DNAdiff snps tabular file') parser.add_argument('--dnadiff_version', action='store', dest='dnadiff_version', default=None, help='DNAdiff version string') parser.add_argument('--errors_file', action='store', dest='errors_file', default=None, help='AMR mutations errors encountered txt file') -parser.add_argument('--fastq_file', action='store', dest='fastq_file', help='Input sample') parser.add_argument('--feature_bed_dir', action='store', dest='feature_bed_dir', help='Directory of best feature hits bed files') parser.add_argument('--feature_png_dir', action='store', dest='feature_png_dir', help='Directory of best feature hits png files') parser.add_argument('--flye_assembly_info_file', action='store', dest='flye_assembly_info_file', default=None, help='Flye assembly info tabular file') parser.add_argument('--genome_insertions_file', action='store', dest='genome_insertions_file', help='Genome insertions BED file') -parser.add_argument('--gzipped', action='store_true', dest='gzipped', default=False, help='Input sample is gzipped') +parser.add_argument('--gzipped', action='store_true', dest='gzipped', default=False, help='Sample(s) is/are gzipped') +parser.add_argument('--illumina_forward_read_file', action='store', dest='illumina_forward_read_file', help='Illumina forward read file') +parser.add_argument('--illumina_reverse_read_file', action='store', dest='illumina_reverse_read_file', help='Illumina reverse read file') parser.add_argument('--kraken2_report_file', action='store', dest='kraken2_report_file', default=None, help='kraken2 report file') parser.add_argument('--kraken2_version', action='store', dest='kraken2_version', default=None, help='kraken2 version string') parser.add_argument('--minimap2_version', action='store', dest='minimap2_version', default=None, help='minimap2 version string') parser.add_argument('--mutation_regions_bed_file', action='store', dest='mutation_regions_bed_file', help='AMR mutation regions BRD file') parser.add_argument('--mutation_regions_dir', action='store', dest='mutation_regions_dir', help='Directory of mutation regions TSV files') +parser.add_argument('--ont_file', action='store', dest='ont_file', help='ONT single read file') parser.add_argument('--pima_css', action='store', dest='pima_css', help='PIMA css stypesheet') parser.add_argument('--plasmids_file', action='store', dest='plasmids_file', default=None, help='pChunks plasmids TSV file') parser.add_argument('--quast_report_file', action='store', dest='quast_report_file', help='Quast report tabular file') @@ -931,17 +935,19 @@ args.dnadiff_snps_file, args.dnadiff_version, args.errors_file, - args.fastq_file, feature_bed_files, feature_png_files, args.flye_assembly_info_file, args.genome_insertions_file, args.gzipped, + args.illumina_forward_read_file, + args.illumina_reverse_read_file, args.kraken2_report_file, args.kraken2_version, args.minimap2_version, args.mutation_regions_bed_file, mutation_regions_files, + args.ont_file, args.pima_css, args.plasmids_file, args.quast_report_file,
--- a/pima_report.xml Thu Apr 20 18:47:01 2023 +0000 +++ b/pima_report.xml Tue Apr 25 20:31:35 2023 +0000 @@ -7,10 +7,10 @@ <command detect_errors="exit_code"><![CDATA[ #import re -#if str($read_type) == 'ont': - #set analysis_name = re.sub('[^\s\w\-]', '_', str($fastq_file.element_identifier)) +#if str($read_type_cond.read_type) == 'ont': + #set analysis_name = re.sub('[^\s\w\-]', '_', str($read_type_cond.ont_file.element_identifier)) #else: - #set analysis_name = re.sub('[^\s\w\-]', '_', str($extracted_fastq_file.element_identifier)) + #set analysis_name = re.sub('[^\s\w\-]', '_', str($read_type_cond.illumina_forward_read.element_identifier)) #end if #set assembly_name = re.sub('[^\s\w\-]', '_', str($assembly_fasta_file.element_identifier)) @@ -23,9 +23,20 @@ #if str($dnadiff_snps_file) not in ['None', '']: #set dnadiff_version = re.sub('[^\s\w\-]', '_', str($dnadiff_snps_file.element_identifier)) #end if -#if str($assembler_version_file) not in ['None', '']: + +## All ONT samples are single-end reads, while all Illumina samples are +## sets if paired reads. For ONT, we need both an assembly_fasta_file +## which is produced by the medaka pipeline and and assembler_version_file +## which is produced by flye. For Illumina we need only the assembly_fasta_file +## which is produced by SPAdes since the version can be derived from it. +#if str($assembler_version_file) in ['None', '']: + ## We're analyzing a set of Illumina paired reads. + #set assembler_version = re.sub('[^\s\w\-]', '_', str($assembly_fasta_file.element_identifier)) +#else: + ## We're analyzing an ONT sample. #set assembler_version = re.sub('[^\s\w\-]', '_', str($assembler_version_file.element_identifier)) #end if + #if str($kraken2_report_file) not in ['None', '']: #set kraken2_version = re.sub('[^\s\w\-]', '_', str($kraken2_report_file.element_identifier)) #end if @@ -97,16 +108,26 @@ --feature_png_dir 'feature_png_dir' #if str($assembler_version_file) not in ['None', '']: --assembler_version '$assembler_version' - #if str($read_type) == 'ont': + #if str($read_type_cond.read_type) == 'ont': ## Need to pass the tabular flye assembly file. --flye_assembly_info_file '$assembler_version_file' #end if #end if --genome_insertions_file '$genome_insertions_file' -#if $fastq_file.ext.endswith(".gz"): - --gzipped +#if str($read_type_cond.read_type) == 'ont': + ## We're analyzing a single-edn ONT sample. + #if $read_type_cond.ont_file.ext.endswith(".gz"): + --gzipped + #end if + --ont_file '$read_type_cond.ont_file' +#else: + ## We're analyzing a set of Illumina paired reads. + #if $read_type_cond.illumina_forward_read.ext.endswith(".gz"): + --gzipped + #end if + --illumina_forward_read_file '$read_type_cond.illumina_forward_read' + --illumina_reverse_read_file '$read_type_cond.illumina_reverse_read' #end if ---fastq_file '$fastq_file' #if str($kraken2_report_file) not in ['None', '']: --kraken2_report_file '$kraken2_report_file' --kraken2_version '$kraken2_version' @@ -119,7 +140,7 @@ --pima_css '${__tool_directory__}/pima.css' --plasmids_file '$plasmids_file' --quast_report_file '$quast_report_file' ---read_type '$read_type' +--read_type '$read_type_cond.read_type' --reference_insertions_file '$reference_insertions_file' #if str($samtools_pileup_file) not in ['None', '']: --samtools_version '$samtools_version' @@ -141,8 +162,6 @@ <param name="contig_coverage_file" type="data" format="tabular,tsv" label="Contig coverage tabular file"/> <param name="dnadiff_snps_file" type="data" format="tabular" label="DNAdiff snps tabular file"/> <param name="errors_file" type="data" format="txt" label="AMR mutation regions error txt file"/> - <param name="extracted_fastq_file" type="data" format="fastqsanger,fastqsanger.gz" optional="true" label="Fastq sample file extracted from a paired collection" help="Used only with Illumina paired reads, leave blank to ignore"/> - <param name="fastq_file" type="data" format="fastqsanger,fastqsanger.gz" label="Fastq sample file"/> <param name="features_bed" format="bed" type="data_collection" collection_type="list" label="Collection of best feature hits BED files"/> <param name="features_png" format="png" type="data_collection" collection_type="list" label="Collection of best feature hits PNG files"/> <param name="assembler_version_file" type="data" format="fasta,tabular,tsv" optional="true" label="Assembly version file" help="Optional, ignored if not selected"/> @@ -152,10 +171,19 @@ <param name="mutation_regions" format="tabular,tsv" type="data_collection" collection_type="list" label="Collection of mutation regions tabular files"/> <param name="mutation_regions_bed_file" type="data" format="mutations_regions,bed" label="Mutation regions BED file"/> <param name="quast_report_file" type="data" format="tabular" label="Quast report tabular file"/> - <param argument="--read_type" type="select" label="Specify the read type"> - <option value="ont" selected="true">Long reads - Oxford Nanopore Technologies (ONT)</option> - <option value="illumina">Short reads - Illumina</option> - </param> + <conditional name="read_type_cond"> + <param argument="--read_type" type="select" label="Specify the read type"> + <option value="ont" selected="true">ONT single read</option> + <option value="illumina">Illumina read pair</option> + </param> + <when value="ont"> + <param name="ont_file" type="data" format="fastqsanger,fastqsanger.gz" label="ONT single read sample file"/> + </when> + <when value="illumina"> + <param name="illumina_forward_read" format="fastqsanger,fastqsanger.gz" type="data" label="Illumina forward read sample file"/> + <param name="illumina_reverse_read" format="fastqsanger,fastqsanger.gz" type="data" label="Illumina reverse read sample file"/> + </when> + </conditional> <param name="reference_insertions_file" type="data" format="bed" label="Reference insertions BED file"/> <param name="plasmids_file" type="data" format="tsv" optional="true" label="pChunks plasmids TSV file" help="Optional, ignored if not selected"/> <param name="samtools_pileup_file" type="data" format="pileup" label="Samtools pileup file"/> @@ -169,7 +197,8 @@ <param name="aligned_sample" value="aligned_sample.bam" ftype="bam"/> <param name="assembly_fasta_file" value="assembly_fasta.fasta" ftype="fasta"/> <param name="contig_coverage_file" value="contig_coverage.tabular" ftype="tabular"/> - <param name="fastq_file" value="ont_fastq.fastq" ftype="fastq"/> + <param name="read_type" value="ont"/> + <param name="ont_file" value="ont_fastq.fastq" ftype="fastq"/> <output name="output" value="output.pdf" ftype="pdf"/> </test> </tests>