Mercurial > repos > greg > vsnp_statistics

--- a/.shed.yml	Sun Jan 03 15:47:28 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,13 +0,0 @@
-name: vsnp_statistics
-owner: greg
-description: |
-  Contains a tool that produces an Excel spreadsheet containing statistics for samples and associated metrics files.
-homepage_url: https://github.com/USDA-VS/vSNP
-long_description: |
-  Contains a tool Accepts a single fastqsanger sample, a set of paired read samples, or a collections of samples
-  along with associated SAMtools idxstats and vSNP zero coverage metrics files and extracts information from them
-  to produce an Excel spreadsheet containing statistics for each sample.
-remote_repository_url: https://github.com/gregvonkuster/galaxy_tools/tree/master/tools/sequence_analysis/vsnp/vsnp_statistics
-type: unrestricted
-categories:
-  - Sequence Analysis
Binary file test-data/13-1941-6_S4_L001_R1_600000.fastq.gz has changed
Binary file test-data/13-1941-6_S4_L001_R2_600000.fastq.gz has changed
Binary file test-data/Mcap_Deer_DE_SRR650221.fastq.gz has changed
--- a/test-data/add_zc_metrics.tabular	Sun Jan 03 15:47:28 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,3 +0,0 @@
-# File	Number of Good SNPs	Average Coverage	Genome Coverage
-MarkDuplicates on data 4_ MarkDuplicates BAM output		10.338671	98.74%
-VCFfilter_ on data 7	611
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/add_zc_metrics1.tabular	Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,3 @@
+# File	Number of Good SNPs	Average Coverage	Genome Coverage
+Mcap_Deer_DE_SRR650221_fastq_gz		0.439436	8.27%
+Mcap_Deer_DE_SRR650221_fastq_gz	36
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/add_zc_metrics2.tabular	Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,3 @@
+# File	Number of Good SNPs	Average Coverage	Genome Coverage
+MarkDuplicates on data 4_ MarkDuplicates BAM output		10.338671	98.74%
+VCFfilter_ on data 7	611
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/add_zc_metrics3.tabular	Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,3 @@
+# File	Number of Good SNPs	Average Coverage	Genome Coverage
+13-1941-6_S4_L001_R1_600000_fastq_gz		0.001252	0.13%
+13-1941-6_S4_L001_R1_600000_fastq_gz	0
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/add_zc_metrics4.tabular	Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,3 @@
+# File	Number of Good SNPs	Average Coverage	Genome Coverage
+Mcap_Deer_DE_SRR650221_fastq_gz		0.439436	8.27%
+Mcap_Deer_DE_SRR650221_fastq_gz	36
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/add_zc_metrics5.tabular	Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,3 @@
+# File	Number of Good SNPs	Average Coverage	Genome Coverage
+13-1941-6_S4_L001_600000_fastq		0.002146	0.16%
+13-1941-6_S4_L001_600000_fastq	0
--- a/test-data/samtools_idxstats.tabular	Sun Jan 03 15:47:28 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
-NC_002945.4	4349904	45	4047
-*	0	0	5
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/samtools_idxstats1.tabular	Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,2 @@
+NC_002945.4	4349904	17063	0
+*	0	0	223
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/samtools_idxstats2.tabular	Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,2 @@
+NC_002945.4	4349904	45	4047
+*	0	0	5
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/samtools_idxstats3.tabular	Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,2 @@
+NC_002945.4	4349904	24	0
+*	0	0	2
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/samtools_idxstats4.tabular	Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,2 @@
+NC_002945.4	4349904	17063	0
+*	0	0	223
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/samtools_idxstats5.tabular	Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,2 @@
+NC_002945.4	4349904	46	2
+*	0	0	4
Binary file test-data/vsnp_statistics.xlsx has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/vsnp_statistics1.tabular	Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,2 @@
+	Reference	File Size	Mean Read Length	Mean Read Quality	Reads Passing Q30	Total Reads	All Mapped Reads	Unmapped Reads	Unmapped Reads Percentage of Total	Reference with Coverage	Average Depth of Coverage	Good SNP Count
+Mcap_Deer_DE_SRR650221_fastq_gz	89	1.6 MB	121.0	29.7	      0.53	4317	17063	223	      0.05	8.27%	0.439436	36
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/vsnp_statistics2.tabular	Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,3 @@
+	Reference	File Size	Mean Read Length	Mean Read Quality	Reads Passing Q30	Total Reads	All Mapped Reads	Unmapped Reads	Unmapped Reads Percentage of Total	Reference with Coverage	Average Depth of Coverage	Good SNP Count
+13-1941-6_S4_L001_R1_600000_fastq_gz	89	8.7 KB	100.0	65.7	      1.00	25	45	5	      0.20	98.74%	10.338671	611
+13-1941-6_S4_L001_R2_600000_fastq_gz	89	8.5 KB	100.0	66.3	      1.00	25	45	5	      0.20	98.74%	10.338671	611
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/vsnp_statistics3.tabular	Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,3 @@
+	Reference	File Size	Mean Read Length	Mean Read Quality	Reads Passing Q30	Total Reads	All Mapped Reads	Unmapped Reads	Unmapped Reads Percentage of Total	Reference with Coverage	Average Depth of Coverage	Good SNP Count
+13-1941-6_S4_L001_R1_600000_fastq_gz	89	8.7 KB	100.0	65.7	      1.00	25	24	2	      0.08	0.13%	0.001252	0
+Mcap_Deer_DE_SRR650221_fastq_gz	89	1.6 MB	121.0	29.7	      0.53	4317	17063	223	      0.05	8.27%	0.439436	36
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/vsnp_statistics4.tabular	Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,3 @@
+	Reference	File Size	Mean Read Length	Mean Read Quality	Reads Passing Q30	Total Reads	All Mapped Reads	Unmapped Reads	Unmapped Reads Percentage of Total	Reference with Coverage	Average Depth of Coverage	Good SNP Count
+13-1941-6_S4_L001_R1_600000_fastq_gz	89	8.7 KB	100.0	65.7	      1.00	25	46	4	      0.16	0.16%	0.002146	0
+13-1941-6_S4_L001_R2_600000_fastq_gz	89	8.5 KB	100.0	66.3	      1.00	25	46	4	      0.16	0.16%	0.002146	0
--- a/vsnp_statistics.py	Sun Jan 03 15:47:28 2021 +0000
+++ b/vsnp_statistics.py	Wed Jun 16 17:38:47 2021 +0000
@@ -1,32 +1,14 @@
 #!/usr/bin/env python

 import argparse
+import csv
 import gzip
 import os
-import shutil
+from functools import partial

 import numpy
 import pandas
-
-QUALITYKEY = {'!': '0', '"': '1', '#': '2', '$': '3', '%': '4', '&': '5', "'": '6', '(': '7',
-              ')': '8', '*': '9', '+': '10', ',': '11', '-': '12', '.': '13', '/': '14', '0': '15',
-              '1': '16', '2': '17', '3': '18', '4': '19', '5': '20', '6': '21', '7': '22',
-              '8': '23', '9': '24', ':': '25', ';': '26', '<': '27', '=': '28', '>': '29',
-              '?': '30', '@': '31', 'A': '32', 'B': '33', 'C': '34', 'D': '35', 'E': '36',
-              'F': '37', 'G': '38', 'H': '39', 'I': '40', 'J': '41', 'K': '42', 'L': '43',
-              'M': '44', 'N': '45', 'O': '46', 'P': '47', 'Q': '48', 'R': '49', 'S': '50',
-              'T': '51', 'U': '52', 'V': '53', 'W': '54', 'X': '55', 'Y': '56', 'Z': '57',
-              '_': '1', ']': '1', '[': '1', '\\': '1', '\n': '1', '`': '1', 'a': '1', 'b': '1',
-              'c': '1', 'd': '1', 'e': '1', 'f': '1', 'g': '1', 'h': '1', 'i': '1', 'j': '1',
-              'k': '1', 'l': '1', 'm': '1', 'n': '1', 'o': '1', 'p': '1', 'q': '1', 'r': '1',
-              's': '1', 't': '1', 'u': '1', 'v': '1', 'w': '1', 'x': '1', 'y': '1', 'z': '1',
-              ' ': '1'}
-
-
-def fastq_to_df(fastq_file, gzipped):
-    if gzipped:
-        return pandas.read_csv(gzip.open(fastq_file, "r"), header=None, sep="^")
-    return pandas.read_csv(open(fastq_file, "r"), header=None, sep="^")
+from Bio import SeqIO


 def nice_size(size):
@@ -62,7 +44,20 @@
         metrics_file = metrics_files[i]
         file_name_base = os.path.basename(fastq_file)
         # Read fastq_file into a data frame.
-        fastq_df = fastq_to_df(fastq_file, gzipped)
+        _open = partial(gzip.open, mode='rt') if gzipped else open
+        with _open(fastq_file) as fh:
+            identifiers = []
+            seqs = []
+            letter_annotations = []
+            for seq_record in SeqIO.parse(fh, "fastq"):
+                identifiers.append(seq_record.id)
+                seqs.append(seq_record.seq)
+                letter_annotations.append(seq_record.letter_annotations["phred_quality"])
+        # Convert lists to Pandas series.
+        s1 = pandas.Series(identifiers, name='id')
+        s2 = pandas.Series(seqs, name='seq')
+        # Gather Series into a data frame.
+        fastq_df = pandas.DataFrame(dict(id=s1, seq=s2)).set_index(['id'])
         total_reads = int(len(fastq_df.index) / 4)
         current_sample_df = pandas.DataFrame(index=[file_name_base], columns=columns)
         # Reference
@@ -76,19 +71,18 @@
         fastq_df = fastq_df.iloc[3::4].sample(sampling_size)
         dict_mean = {}
         list_length = []
-        for index, row in fastq_df.iterrows():
-            base_qualities = []
-            for base in list(row.array[0]):
-                base_qualities.append(int(QUALITYKEY[base]))
-            dict_mean[index] = numpy.mean(base_qualities)
-            list_length.append(len(row.array[0]))
-        current_sample_df.at[file_name_base, 'Mean Read Length'] = "%.1f" % numpy.mean(list_length)
+        i = 0
+        for id, seq, in fastq_df.iterrows():
+            dict_mean[id] = numpy.mean(letter_annotations[i])
+            list_length.append(len(seq.array[0]))
+            i += 1
+        current_sample_df.at[file_name_base, 'Mean Read Length'] = '%.1f' % numpy.mean(list_length)
         # Mean Read Quality
         df_mean = pandas.DataFrame.from_dict(dict_mean, orient='index', columns=['ave'])
-        current_sample_df.at[file_name_base, 'Mean Read Quality'] = "%.1f" % df_mean['ave'].mean()
+        current_sample_df.at[file_name_base, 'Mean Read Quality'] = '%.1f' % df_mean['ave'].mean()
         # Reads Passing Q30
         reads_gt_q30 = len(df_mean[df_mean['ave'] >= 30])
-        reads_passing_q30 = "{:10.2f}".format(reads_gt_q30 / sampling_size)
+        reads_passing_q30 = '{:10.2f}'.format(reads_gt_q30 / sampling_size)
         current_sample_df.at[file_name_base, 'Reads Passing Q30'] = reads_passing_q30
         # Total Reads
         current_sample_df.at[file_name_base, 'Total Reads'] = total_reads
@@ -99,7 +93,7 @@
         current_sample_df.at[file_name_base, 'Unmapped Reads'] = unmapped_reads
         # Unmapped Reads Percentage of Total
         if unmapped_reads > 0:
-            unmapped_reads_percentage = "{:10.2f}".format(unmapped_reads / total_reads)
+            unmapped_reads_percentage = '{:10.2f}'.format(unmapped_reads / total_reads)
         else:
             unmapped_reads_percentage = 0
         current_sample_df.at[file_name_base, 'Unmapped Reads Percentage of Total'] = unmapped_reads_percentage
@@ -111,12 +105,8 @@
         # Good SNP Count
         current_sample_df.at[file_name_base, 'Good SNP Count'] = good_snp_count
         data_frames.append(current_sample_df)
-    excel_df = pandas.concat(data_frames)
-    excel_file_name = "output.xlsx"
-    writer = pandas.ExcelWriter(excel_file_name, engine='xlsxwriter')
-    excel_df.to_excel(writer, sheet_name='Sheet1')
-    writer.save()
-    shutil.move(excel_file_name, output_file)
+    output_df = pandas.concat(data_frames)
+    output_df.to_csv(output_file, sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\')


 def process_idxstats_file(idxstats_file):
@@ -124,6 +114,7 @@
     unmapped_reads = 0
     with open(idxstats_file, "r") as fh:
         for i, line in enumerate(fh):
+            line = line.rstrip('\r\n')
             items = line.split("\t")
             if i == 0:
                 # NC_002945.4 4349904 213570 4047
@@ -143,6 +134,7 @@
             if i == 0:
                 # Skip comments.
                 continue
+            line = line.rstrip('\r\n')
             items = line.split("\t")
             if i == 1:
                 # MarkDuplicates 10.338671 98.74%
--- a/vsnp_statistics.xml	Sun Jan 03 15:47:28 2021 +0000
+++ b/vsnp_statistics.xml	Wed Jun 16 17:38:47 2021 +0000
@@ -4,10 +4,10 @@
         <import>macros.xml</import>
     </macros>
     <requirements>
+        <requirement type="package" version="1.78">biopython</requirement>
         <requirement type="package" version="1.16.5">numpy</requirement>
         <requirement type="package" version="0.25.3">pandas</requirement>
         <requirement type="package" version="1.2.0">xlrd</requirement>
-        <requirement type="package" version="1.2.8">xlsxwriter</requirement>
     </requirements>
     <command detect_errors="exit_code"><![CDATA[
 #import re
@@ -121,7 +121,7 @@
         </conditional>
     </inputs>
     <outputs>
-        <data name="output" format="xlsx"/>
+        <data name="output" format="tabular"/>
     </outputs>
     <tests>
         <!-- A single fastq file -->
@@ -131,7 +131,7 @@
             <param name="read1" value="Mcap_Deer_DE_SRR650221.fastq.gz" ftype="fastqsanger.gz" dbkey="89"/>
             <param name="samtools_idxstats" value="samtools_idxstats1.tabular" ftype="tabular" dbkey="89"/>
             <param name="vsnp_azc" value="add_zc_metrics1.tabular" ftype="tabular" dbkey="89"/>
-            <output name="output" file="vsnp_statistics1.xlsx" ftype="xlsx" compare="sim_size"/>
+            <output name="output" file="vsnp_statistics1.tabular" ftype="tabular"/>
         </test>
         <!-- A set of paired fastq files -->
         <test expect_num_outputs="1">
@@ -141,7 +141,7 @@
             <param name="read2" value="13-1941-6_S4_L001_R2_600000.fastq.gz" ftype="fastqsanger.gz" dbkey="89"/>
             <param name="samtools_idxstats" value="samtools_idxstats2.tabular" ftype="tabular" dbkey="89"/>
             <param name="vsnp_azc" value="add_zc_metrics2.tabular" ftype="tabular" dbkey="89"/>
-            <output name="output" file="vsnp_statistics2.xlsx" ftype="xlsx" compare="sim_size"/>
+            <output name="output" file="vsnp_statistics2.tabular" ftype="tabular"/>
         </test>
         <!-- A collection of SE fastq files -->
         <test expect_num_outputs="1">
@@ -165,7 +165,7 @@
                     <element name="Mcap_Deer_DE_SRR650221.fastq.gz" value="add_zc_metrics4.tabular" dbkey="89"/>
                 </collection>
             </param>
-            <output name="output" file="vsnp_statistics3.xlsx" ftype="xlsx" compare="sim_size"/>
+            <output name="output" file="vsnp_statistics3.tabular" ftype="tabular"/>
         </test>
         <!-- A collection of PE fastq files -->
         <test expect_num_outputs="1">
@@ -187,7 +187,7 @@
                     <element name="13-1941-6_S4_L001_R1_600000.fastq" value="add_zc_metrics5.tabular" dbkey="89"/>
                 </collection>
             </param>
-            <output name="output" file="vsnp_statistics4.xlsx" ftype="xlsx" compare="sim_size"/>
+            <output name="output" file="vsnp_statistics4.tabular" ftype="tabular"/>
         </test>
     </tests>
     <help>