Mercurial > repos > greg > vsnp_statistics
changeset 5:d0fbdeaaa488 draft
"planemo upload for repository https://github.com/gregvonkuster/galaxy_tools/tree/master/tools/sequence_analysis/vsnp/vsnp_statistics commit 770e89322a15829580ed9577a853660f63233f32"
author | greg |
---|---|
date | Wed, 16 Jun 2021 17:38:47 +0000 |
parents | 2d6c6b01319e |
children | 429892a80419 |
files | .shed.yml test-data/13-1941-6_S4_L001_R1_600000.fastq.gz test-data/13-1941-6_S4_L001_R2_600000.fastq.gz test-data/Mcap_Deer_DE_SRR650221.fastq.gz test-data/add_zc_metrics.tabular test-data/add_zc_metrics1.tabular test-data/add_zc_metrics2.tabular test-data/add_zc_metrics3.tabular test-data/add_zc_metrics4.tabular test-data/add_zc_metrics5.tabular test-data/samtools_idxstats.tabular test-data/samtools_idxstats1.tabular test-data/samtools_idxstats2.tabular test-data/samtools_idxstats3.tabular test-data/samtools_idxstats4.tabular test-data/samtools_idxstats5.tabular test-data/vsnp_statistics.xlsx test-data/vsnp_statistics1.tabular test-data/vsnp_statistics2.tabular test-data/vsnp_statistics3.tabular test-data/vsnp_statistics4.tabular vsnp_statistics.py vsnp_statistics.xml |
diffstat | 23 files changed, 72 insertions(+), 62 deletions(-) [+] |
line wrap: on
line diff
--- a/.shed.yml Sun Jan 03 15:47:28 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,13 +0,0 @@ -name: vsnp_statistics -owner: greg -description: | - Contains a tool that produces an Excel spreadsheet containing statistics for samples and associated metrics files. -homepage_url: https://github.com/USDA-VS/vSNP -long_description: | - Contains a tool Accepts a single fastqsanger sample, a set of paired read samples, or a collections of samples - along with associated SAMtools idxstats and vSNP zero coverage metrics files and extracts information from them - to produce an Excel spreadsheet containing statistics for each sample. -remote_repository_url: https://github.com/gregvonkuster/galaxy_tools/tree/master/tools/sequence_analysis/vsnp/vsnp_statistics -type: unrestricted -categories: - - Sequence Analysis
--- a/test-data/add_zc_metrics.tabular Sun Jan 03 15:47:28 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,3 +0,0 @@ -# File Number of Good SNPs Average Coverage Genome Coverage -MarkDuplicates on data 4_ MarkDuplicates BAM output 10.338671 98.74% -VCFfilter_ on data 7 611
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/add_zc_metrics1.tabular Wed Jun 16 17:38:47 2021 +0000 @@ -0,0 +1,3 @@ +# File Number of Good SNPs Average Coverage Genome Coverage +Mcap_Deer_DE_SRR650221_fastq_gz 0.439436 8.27% +Mcap_Deer_DE_SRR650221_fastq_gz 36
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/add_zc_metrics2.tabular Wed Jun 16 17:38:47 2021 +0000 @@ -0,0 +1,3 @@ +# File Number of Good SNPs Average Coverage Genome Coverage +MarkDuplicates on data 4_ MarkDuplicates BAM output 10.338671 98.74% +VCFfilter_ on data 7 611
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/add_zc_metrics3.tabular Wed Jun 16 17:38:47 2021 +0000 @@ -0,0 +1,3 @@ +# File Number of Good SNPs Average Coverage Genome Coverage +13-1941-6_S4_L001_R1_600000_fastq_gz 0.001252 0.13% +13-1941-6_S4_L001_R1_600000_fastq_gz 0
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/add_zc_metrics4.tabular Wed Jun 16 17:38:47 2021 +0000 @@ -0,0 +1,3 @@ +# File Number of Good SNPs Average Coverage Genome Coverage +Mcap_Deer_DE_SRR650221_fastq_gz 0.439436 8.27% +Mcap_Deer_DE_SRR650221_fastq_gz 36
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/add_zc_metrics5.tabular Wed Jun 16 17:38:47 2021 +0000 @@ -0,0 +1,3 @@ +# File Number of Good SNPs Average Coverage Genome Coverage +13-1941-6_S4_L001_600000_fastq 0.002146 0.16% +13-1941-6_S4_L001_600000_fastq 0
--- a/test-data/samtools_idxstats.tabular Sun Jan 03 15:47:28 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ -NC_002945.4 4349904 45 4047 -* 0 0 5
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/samtools_idxstats1.tabular Wed Jun 16 17:38:47 2021 +0000 @@ -0,0 +1,2 @@ +NC_002945.4 4349904 17063 0 +* 0 0 223
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/samtools_idxstats2.tabular Wed Jun 16 17:38:47 2021 +0000 @@ -0,0 +1,2 @@ +NC_002945.4 4349904 45 4047 +* 0 0 5
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/samtools_idxstats3.tabular Wed Jun 16 17:38:47 2021 +0000 @@ -0,0 +1,2 @@ +NC_002945.4 4349904 24 0 +* 0 0 2
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/samtools_idxstats4.tabular Wed Jun 16 17:38:47 2021 +0000 @@ -0,0 +1,2 @@ +NC_002945.4 4349904 17063 0 +* 0 0 223
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/samtools_idxstats5.tabular Wed Jun 16 17:38:47 2021 +0000 @@ -0,0 +1,2 @@ +NC_002945.4 4349904 46 2 +* 0 0 4
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/vsnp_statistics1.tabular Wed Jun 16 17:38:47 2021 +0000 @@ -0,0 +1,2 @@ + Reference File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count +Mcap_Deer_DE_SRR650221_fastq_gz 89 1.6 MB 121.0 29.7 0.53 4317 17063 223 0.05 8.27% 0.439436 36
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/vsnp_statistics2.tabular Wed Jun 16 17:38:47 2021 +0000 @@ -0,0 +1,3 @@ + Reference File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count +13-1941-6_S4_L001_R1_600000_fastq_gz 89 8.7 KB 100.0 65.7 1.00 25 45 5 0.20 98.74% 10.338671 611 +13-1941-6_S4_L001_R2_600000_fastq_gz 89 8.5 KB 100.0 66.3 1.00 25 45 5 0.20 98.74% 10.338671 611
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/vsnp_statistics3.tabular Wed Jun 16 17:38:47 2021 +0000 @@ -0,0 +1,3 @@ + Reference File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count +13-1941-6_S4_L001_R1_600000_fastq_gz 89 8.7 KB 100.0 65.7 1.00 25 24 2 0.08 0.13% 0.001252 0 +Mcap_Deer_DE_SRR650221_fastq_gz 89 1.6 MB 121.0 29.7 0.53 4317 17063 223 0.05 8.27% 0.439436 36
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/vsnp_statistics4.tabular Wed Jun 16 17:38:47 2021 +0000 @@ -0,0 +1,3 @@ + Reference File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count +13-1941-6_S4_L001_R1_600000_fastq_gz 89 8.7 KB 100.0 65.7 1.00 25 46 4 0.16 0.16% 0.002146 0 +13-1941-6_S4_L001_R2_600000_fastq_gz 89 8.5 KB 100.0 66.3 1.00 25 46 4 0.16 0.16% 0.002146 0
--- a/vsnp_statistics.py Sun Jan 03 15:47:28 2021 +0000 +++ b/vsnp_statistics.py Wed Jun 16 17:38:47 2021 +0000 @@ -1,32 +1,14 @@ #!/usr/bin/env python import argparse +import csv import gzip import os -import shutil +from functools import partial import numpy import pandas - -QUALITYKEY = {'!': '0', '"': '1', '#': '2', '$': '3', '%': '4', '&': '5', "'": '6', '(': '7', - ')': '8', '*': '9', '+': '10', ',': '11', '-': '12', '.': '13', '/': '14', '0': '15', - '1': '16', '2': '17', '3': '18', '4': '19', '5': '20', '6': '21', '7': '22', - '8': '23', '9': '24', ':': '25', ';': '26', '<': '27', '=': '28', '>': '29', - '?': '30', '@': '31', 'A': '32', 'B': '33', 'C': '34', 'D': '35', 'E': '36', - 'F': '37', 'G': '38', 'H': '39', 'I': '40', 'J': '41', 'K': '42', 'L': '43', - 'M': '44', 'N': '45', 'O': '46', 'P': '47', 'Q': '48', 'R': '49', 'S': '50', - 'T': '51', 'U': '52', 'V': '53', 'W': '54', 'X': '55', 'Y': '56', 'Z': '57', - '_': '1', ']': '1', '[': '1', '\\': '1', '\n': '1', '`': '1', 'a': '1', 'b': '1', - 'c': '1', 'd': '1', 'e': '1', 'f': '1', 'g': '1', 'h': '1', 'i': '1', 'j': '1', - 'k': '1', 'l': '1', 'm': '1', 'n': '1', 'o': '1', 'p': '1', 'q': '1', 'r': '1', - 's': '1', 't': '1', 'u': '1', 'v': '1', 'w': '1', 'x': '1', 'y': '1', 'z': '1', - ' ': '1'} - - -def fastq_to_df(fastq_file, gzipped): - if gzipped: - return pandas.read_csv(gzip.open(fastq_file, "r"), header=None, sep="^") - return pandas.read_csv(open(fastq_file, "r"), header=None, sep="^") +from Bio import SeqIO def nice_size(size): @@ -62,7 +44,20 @@ metrics_file = metrics_files[i] file_name_base = os.path.basename(fastq_file) # Read fastq_file into a data frame. - fastq_df = fastq_to_df(fastq_file, gzipped) + _open = partial(gzip.open, mode='rt') if gzipped else open + with _open(fastq_file) as fh: + identifiers = [] + seqs = [] + letter_annotations = [] + for seq_record in SeqIO.parse(fh, "fastq"): + identifiers.append(seq_record.id) + seqs.append(seq_record.seq) + letter_annotations.append(seq_record.letter_annotations["phred_quality"]) + # Convert lists to Pandas series. + s1 = pandas.Series(identifiers, name='id') + s2 = pandas.Series(seqs, name='seq') + # Gather Series into a data frame. + fastq_df = pandas.DataFrame(dict(id=s1, seq=s2)).set_index(['id']) total_reads = int(len(fastq_df.index) / 4) current_sample_df = pandas.DataFrame(index=[file_name_base], columns=columns) # Reference @@ -76,19 +71,18 @@ fastq_df = fastq_df.iloc[3::4].sample(sampling_size) dict_mean = {} list_length = [] - for index, row in fastq_df.iterrows(): - base_qualities = [] - for base in list(row.array[0]): - base_qualities.append(int(QUALITYKEY[base])) - dict_mean[index] = numpy.mean(base_qualities) - list_length.append(len(row.array[0])) - current_sample_df.at[file_name_base, 'Mean Read Length'] = "%.1f" % numpy.mean(list_length) + i = 0 + for id, seq, in fastq_df.iterrows(): + dict_mean[id] = numpy.mean(letter_annotations[i]) + list_length.append(len(seq.array[0])) + i += 1 + current_sample_df.at[file_name_base, 'Mean Read Length'] = '%.1f' % numpy.mean(list_length) # Mean Read Quality df_mean = pandas.DataFrame.from_dict(dict_mean, orient='index', columns=['ave']) - current_sample_df.at[file_name_base, 'Mean Read Quality'] = "%.1f" % df_mean['ave'].mean() + current_sample_df.at[file_name_base, 'Mean Read Quality'] = '%.1f' % df_mean['ave'].mean() # Reads Passing Q30 reads_gt_q30 = len(df_mean[df_mean['ave'] >= 30]) - reads_passing_q30 = "{:10.2f}".format(reads_gt_q30 / sampling_size) + reads_passing_q30 = '{:10.2f}'.format(reads_gt_q30 / sampling_size) current_sample_df.at[file_name_base, 'Reads Passing Q30'] = reads_passing_q30 # Total Reads current_sample_df.at[file_name_base, 'Total Reads'] = total_reads @@ -99,7 +93,7 @@ current_sample_df.at[file_name_base, 'Unmapped Reads'] = unmapped_reads # Unmapped Reads Percentage of Total if unmapped_reads > 0: - unmapped_reads_percentage = "{:10.2f}".format(unmapped_reads / total_reads) + unmapped_reads_percentage = '{:10.2f}'.format(unmapped_reads / total_reads) else: unmapped_reads_percentage = 0 current_sample_df.at[file_name_base, 'Unmapped Reads Percentage of Total'] = unmapped_reads_percentage @@ -111,12 +105,8 @@ # Good SNP Count current_sample_df.at[file_name_base, 'Good SNP Count'] = good_snp_count data_frames.append(current_sample_df) - excel_df = pandas.concat(data_frames) - excel_file_name = "output.xlsx" - writer = pandas.ExcelWriter(excel_file_name, engine='xlsxwriter') - excel_df.to_excel(writer, sheet_name='Sheet1') - writer.save() - shutil.move(excel_file_name, output_file) + output_df = pandas.concat(data_frames) + output_df.to_csv(output_file, sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\') def process_idxstats_file(idxstats_file): @@ -124,6 +114,7 @@ unmapped_reads = 0 with open(idxstats_file, "r") as fh: for i, line in enumerate(fh): + line = line.rstrip('\r\n') items = line.split("\t") if i == 0: # NC_002945.4 4349904 213570 4047 @@ -143,6 +134,7 @@ if i == 0: # Skip comments. continue + line = line.rstrip('\r\n') items = line.split("\t") if i == 1: # MarkDuplicates 10.338671 98.74%
--- a/vsnp_statistics.xml Sun Jan 03 15:47:28 2021 +0000 +++ b/vsnp_statistics.xml Wed Jun 16 17:38:47 2021 +0000 @@ -4,10 +4,10 @@ <import>macros.xml</import> </macros> <requirements> + <requirement type="package" version="1.78">biopython</requirement> <requirement type="package" version="1.16.5">numpy</requirement> <requirement type="package" version="0.25.3">pandas</requirement> <requirement type="package" version="1.2.0">xlrd</requirement> - <requirement type="package" version="1.2.8">xlsxwriter</requirement> </requirements> <command detect_errors="exit_code"><![CDATA[ #import re @@ -121,7 +121,7 @@ </conditional> </inputs> <outputs> - <data name="output" format="xlsx"/> + <data name="output" format="tabular"/> </outputs> <tests> <!-- A single fastq file --> @@ -131,7 +131,7 @@ <param name="read1" value="Mcap_Deer_DE_SRR650221.fastq.gz" ftype="fastqsanger.gz" dbkey="89"/> <param name="samtools_idxstats" value="samtools_idxstats1.tabular" ftype="tabular" dbkey="89"/> <param name="vsnp_azc" value="add_zc_metrics1.tabular" ftype="tabular" dbkey="89"/> - <output name="output" file="vsnp_statistics1.xlsx" ftype="xlsx" compare="sim_size"/> + <output name="output" file="vsnp_statistics1.tabular" ftype="tabular"/> </test> <!-- A set of paired fastq files --> <test expect_num_outputs="1"> @@ -141,7 +141,7 @@ <param name="read2" value="13-1941-6_S4_L001_R2_600000.fastq.gz" ftype="fastqsanger.gz" dbkey="89"/> <param name="samtools_idxstats" value="samtools_idxstats2.tabular" ftype="tabular" dbkey="89"/> <param name="vsnp_azc" value="add_zc_metrics2.tabular" ftype="tabular" dbkey="89"/> - <output name="output" file="vsnp_statistics2.xlsx" ftype="xlsx" compare="sim_size"/> + <output name="output" file="vsnp_statistics2.tabular" ftype="tabular"/> </test> <!-- A collection of SE fastq files --> <test expect_num_outputs="1"> @@ -165,7 +165,7 @@ <element name="Mcap_Deer_DE_SRR650221.fastq.gz" value="add_zc_metrics4.tabular" dbkey="89"/> </collection> </param> - <output name="output" file="vsnp_statistics3.xlsx" ftype="xlsx" compare="sim_size"/> + <output name="output" file="vsnp_statistics3.tabular" ftype="tabular"/> </test> <!-- A collection of PE fastq files --> <test expect_num_outputs="1"> @@ -187,7 +187,7 @@ <element name="13-1941-6_S4_L001_R1_600000.fastq" value="add_zc_metrics5.tabular" dbkey="89"/> </collection> </param> - <output name="output" file="vsnp_statistics4.xlsx" ftype="xlsx" compare="sim_size"/> + <output name="output" file="vsnp_statistics4.tabular" ftype="tabular"/> </test> </tests> <help>