Mercurial > repos > greg > vsnp_statistics
changeset 8:1becb6606626 draft
Uploaded
| author   | greg |
|----------|------|
| date     | Mon, 02 Aug 2021 13:34:09 +0000 |
| parents  | de2af65c4633 |
| children | 504b9185bcc0 |
| files    | macros.xml, test-data/add_zc_metrics3.tabular, test-data/add_zc_metrics4.tabular, test-data/samtools_idxstats3.tabular, test-data/samtools_idxstats4.tabular, test-data/vsnp_statistics1.tabular, test-data/vsnp_statistics2.tabular, test-data/vsnp_statistics3.tabular, test-data/vsnp_statistics4.tabular, vsnp_statistics.py, vsnp_statistics.xml |
| diffstat | 11 files changed, 193 insertions(+), 256 deletions(-) |
--- a/macros.xml	Thu Jul 22 18:05:22 2021 +0000
+++ b/macros.xml	Mon Aug 02 13:34:09 2021 +0000
@@ -1,7 +1,31 @@
 <?xml version='1.0' encoding='UTF-8'?>
 <macros>
     <token name="@WRAPPER_VERSION@">1.0</token>
-    <token name="@PROFILE@">19.09</token>
+    <token name="@PROFILE@">21.05</token>
+    <xml name="biopython_requirement">
+        <requirement type="package" version="1.79">biopython</requirement>
+    </xml>
+    <xml name="numpy_requirement">
+        <requirement type="package" version="1.21.1">numpy</requirement>
+    </xml>
+    <xml name="openpyxl_requirement">
+        <requirement type="package" version="3.0.7">openpyxl</requirement>
+    </xml>
+    <xml name="pandas_requirement">
+        <requirement type="package" version="1.3.0">pandas</requirement>
+    </xml>
+    <xml name="pysam_requirement">
+        <requirement type="package" version="0.15.4">pysam</requirement>
+    </xml>
+    <xml name="pyyaml_requirement">
+        <requirement type="package" version="5.3">pyyaml</requirement>
+    </xml>
+    <xml name="xlrd_requirement">
+        <requirement type="package" version="2.0.1">xlrd</requirement>
+    </xml>
+    <xml name="xlsxwriter_requirement">
+        <requirement type="package" version="1.4.4">xlsxwriter</requirement>
+    </xml>
     <xml name="param_reference_source">
         <param name="reference_source" type="select" label="Choose the source for the reference genome">
             <option value="cached" selected="true">locally cached</option>
--- a/test-data/add_zc_metrics3.tabular	Thu Jul 22 18:05:22 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,3 +0,0 @@
-# File Number of Good SNPs Average Coverage Genome Coverage
-13-1941-6_S4_L001_R1_600000_fastq_gz 0.001252 0.13%
-13-1941-6_S4_L001_R1_600000_fastq_gz 0
--- a/test-data/add_zc_metrics4.tabular	Thu Jul 22 18:05:22 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,3 +0,0 @@
-# File Number of Good SNPs Average Coverage Genome Coverage
-Mcap_Deer_DE_SRR650221_fastq_gz 0.439436 8.27%
-Mcap_Deer_DE_SRR650221_fastq_gz 36
--- a/test-data/samtools_idxstats3.tabular	Thu Jul 22 18:05:22 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
-NC_002945.4 4349904 24 0
-* 0 0 2
--- a/test-data/samtools_idxstats4.tabular	Thu Jul 22 18:05:22 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
-NC_002945.4 4349904 17063 0
-* 0 0 223
--- a/test-data/vsnp_statistics1.tabular	Thu Jul 22 18:05:22 2021 +0000
+++ b/test-data/vsnp_statistics1.tabular	Mon Aug 02 13:34:09 2021 +0000
@@ -1,2 +1,2 @@
- Reference File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count
-Mcap_Deer_DE_SRR650221_fastq_gz 89 1.6 MB 121.0 29.7 0.53 4317 17063 223 0.05 8.27% 0.439436 36
+Reference FASTQ File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count
+89 Mcap_Deer_DE_SRR650221_fastq_gz 1.6 MB 121.0 29.7 0.53 4317 17063 223 0.05 8.27% 0.439436 36
--- a/test-data/vsnp_statistics2.tabular	Thu Jul 22 18:05:22 2021 +0000
+++ b/test-data/vsnp_statistics2.tabular	Mon Aug 02 13:34:09 2021 +0000
@@ -1,3 +1,2 @@
- Reference File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count
-13-1941-6_S4_L001_R1_600000_fastq_gz 89 8.7 KB 100.0 65.7 1.00 25 45 5 0.20 98.74% 10.338671 611
-13-1941-6_S4_L001_R2_600000_fastq_gz 89 8.5 KB 100.0 66.3 1.00 25 45 5 0.20 98.74% 10.338671 611
+Reference Read1 FASTQ File Size Reads Mean Read Length Mean Read Quality Reads Passing Q30 Read2 FASTQ File Size Reads Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count
+89 13-1941-6_S4_L001_R1_600000_fastq_gz 8.7 KB 25 100.0 65.7 1.00 13-1941-6_S4_L001_R2_600000_fastq_gz 8.5 KB 25 100.0 66.3 1.00 50 45 5 0.10 98.74% 10.338671 611
--- a/test-data/vsnp_statistics3.tabular	Thu Jul 22 18:05:22 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,3 +0,0 @@
- Reference File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count
-13-1941-6_S4_L001_R1_600000_fastq_gz 89 8.7 KB 100.0 65.7 1.00 25 24 2 0.08 0.13% 0.001252 0
-Mcap_Deer_DE_SRR650221_fastq_gz 89 1.6 MB 121.0 29.7 0.53 4317 17063 223 0.05 8.27% 0.439436 36
--- a/test-data/vsnp_statistics4.tabular	Thu Jul 22 18:05:22 2021 +0000
+++ b/test-data/vsnp_statistics4.tabular	Mon Aug 02 13:34:09 2021 +0000
@@ -1,3 +1,2 @@
- Reference File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count
-13-1941-6_S4_L001_R1_600000_fastq_gz 89 8.7 KB 100.0 65.7 1.00 25 46 4 0.16 0.16% 0.002146 0
-13-1941-6_S4_L001_R2_600000_fastq_gz 89 8.5 KB 100.0 66.3 1.00 25 46 4 0.16 0.16% 0.002146 0
+Reference Read1 FASTQ File Size Reads Mean Read Length Mean Read Quality Reads Passing Q30 Read2 FASTQ File Size Reads Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count
+89 13-1941-6_S4_L001_R1_600000_fastq_gz 8.7 KB 25 100.0 65.7 1.00 13-1941-6_S4_L001_R2_600000_fastq_gz 8.5 KB 25 100.0 66.3 1.00 50 46 4 0.08 0.16% 0.002146 0
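Note on the regenerated expected outputs above: the columns are not merely reordered. Because a pair of FASTQ files now shares a single row, the unmapped-read fraction is computed against the combined read count, which is why the expected value in vsnp_statistics4.tabular drops from 0.16 to 0.08. A quick check with the numbers from the test data (a sketch, reusing the script's own format string):

```python
# Values taken from the test data above: 4 unmapped reads, 25 reads in each FASTQ.
unmapped_reads = 4
total_reads = 25 + 25                        # R1 + R2 reads now counted together
old_per_file = unmapped_reads / 25           # one row per FASTQ before this changeset
new_combined = unmapped_reads / total_reads  # one row per sample after it
print('{:10.2f} {:10.2f}'.format(old_per_file, new_combined))  # 0.16  0.08
```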
--- a/vsnp_statistics.py	Thu Jul 22 18:05:22 2021 +0000
+++ b/vsnp_statistics.py	Mon Aug 02 13:34:09 2021 +0000
@@ -1,7 +1,6 @@
 #!/usr/bin/env python
 
 import argparse
-import csv
 import gzip
 import os
 from functools import partial
@@ -11,6 +10,18 @@
 from Bio import SeqIO
 
 
+class Statistics:
+
+    def __init__(self, reference, fastq_file, file_size, total_reads, mean_read_length, mean_read_quality, reads_passing_q30):
+        self.reference = reference
+        self.fastq_file = fastq_file
+        self.file_size = file_size
+        self.total_reads = total_reads
+        self.mean_read_length = mean_read_length
+        self.mean_read_quality = mean_read_quality
+        self.reads_passing_q30 = reads_passing_q30
+
+
 def nice_size(size):
     # Returns a readably formatted string with the size
     words = ['bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB']
@@ -32,81 +43,114 @@
     return '??? bytes'
 
 
-def output_statistics(fastq_files, idxstats_files, metrics_files, output_file, gzipped, dbkey):
-    # Produce an Excel spreadsheet that
-    # contains a row for each sample.
-    columns = ['Reference', 'File Size', 'Mean Read Length', 'Mean Read Quality', 'Reads Passing Q30',
-               'Total Reads', 'All Mapped Reads', 'Unmapped Reads', 'Unmapped Reads Percentage of Total',
-               'Reference with Coverage', 'Average Depth of Coverage', 'Good SNP Count']
-    data_frames = []
-    for i, fastq_file in enumerate(fastq_files):
-        idxstats_file = idxstats_files[i]
-        metrics_file = metrics_files[i]
-        file_name_base = os.path.basename(fastq_file)
-        # Read fastq_file into a data frame.
-        _open = partial(gzip.open, mode='rt') if gzipped else open
-        with _open(fastq_file) as fh:
-            identifiers = []
-            seqs = []
-            letter_annotations = []
-            for seq_record in SeqIO.parse(fh, "fastq"):
-                identifiers.append(seq_record.id)
-                seqs.append(seq_record.seq)
-                letter_annotations.append(seq_record.letter_annotations["phred_quality"])
-        # Convert lists to Pandas series.
-        s1 = pandas.Series(identifiers, name='id')
-        s2 = pandas.Series(seqs, name='seq')
-        # Gather Series into a data frame.
-        fastq_df = pandas.DataFrame(dict(id=s1, seq=s2)).set_index(['id'])
-        total_reads = int(len(fastq_df.index) / 4)
-        current_sample_df = pandas.DataFrame(index=[file_name_base], columns=columns)
-        # Reference
-        current_sample_df.at[file_name_base, 'Reference'] = dbkey
-        # File Size
-        current_sample_df.at[file_name_base, 'File Size'] = nice_size(os.path.getsize(fastq_file))
-        # Mean Read Length
-        sampling_size = 10000
-        if sampling_size > total_reads:
-            sampling_size = total_reads
-        fastq_df = fastq_df.iloc[3::4].sample(sampling_size)
-        dict_mean = {}
-        list_length = []
-        i = 0
-        for id, seq, in fastq_df.iterrows():
-            dict_mean[id] = numpy.mean(letter_annotations[i])
-            list_length.append(len(seq.array[0]))
-            i += 1
-        current_sample_df.at[file_name_base, 'Mean Read Length'] = '%.1f' % numpy.mean(list_length)
-        # Mean Read Quality
-        df_mean = pandas.DataFrame.from_dict(dict_mean, orient='index', columns=['ave'])
-        current_sample_df.at[file_name_base, 'Mean Read Quality'] = '%.1f' % df_mean['ave'].mean()
-        # Reads Passing Q30
-        reads_gt_q30 = len(df_mean[df_mean['ave'] >= 30])
-        reads_passing_q30 = '{:10.2f}'.format(reads_gt_q30 / sampling_size)
-        current_sample_df.at[file_name_base, 'Reads Passing Q30'] = reads_passing_q30
+def get_statistics(dbkey, fastq_file, gzipped):
+    sampling_size = 10000
+    # Read fastq_file into a data fram to
+    # get the phred quality scores.
+    _open = partial(gzip.open, mode='rt') if gzipped else open
+    with _open(fastq_file) as fh:
+        identifiers = []
+        seqs = []
+        letter_annotations = []
+        for seq_record in SeqIO.parse(fh, "fastq"):
+            identifiers.append(seq_record.id)
+            seqs.append(seq_record.seq)
+            letter_annotations.append(seq_record.letter_annotations["phred_quality"])
+    # Convert lists to Pandas series.
+    s1 = pandas.Series(identifiers, name='id')
+    s2 = pandas.Series(seqs, name='seq')
+    # Gather Series into a data frame.
+    fastq_df = pandas.DataFrame(dict(id=s1, seq=s2)).set_index(['id'])
+    # Starting at row 3, keep every 4 row
+    # random sample specified number of rows.
+    file_size = nice_size(os.path.getsize(fastq_file))
+    total_reads = int(len(fastq_df.index) / 4)
+    # Mean Read Length
+    if sampling_size > total_reads:
+        sampling_size = total_reads
+    fastq_df = fastq_df.iloc[3::4].sample(sampling_size)
+    dict_mean = {}
+    list_length = []
+    i = 0
+    for id, seq, in fastq_df.iterrows():
+        dict_mean[id] = numpy.mean(letter_annotations[i])
+        list_length.append(len(seq.array[0]))
+        i += 1
+    mean_read_length = '%.1f' % numpy.mean(list_length)
+    # Mean Read Quality
+    df_mean = pandas.DataFrame.from_dict(dict_mean, orient='index', columns=['ave'])
+    mean_read_quality = '%.1f' % df_mean['ave'].mean()
+    # Reads Passing Q30
+    reads_gt_q30 = len(df_mean[df_mean['ave'] >= 30])
+    reads_passing_q30 = '{:10.2f}'.format(reads_gt_q30 / sampling_size)
+    stats = Statistics(dbkey, os.path.basename(fastq_file), file_size, total_reads, mean_read_length,
+                       mean_read_quality, reads_passing_q30)
+    return stats
+
+
+def accrue_statistics(dbkey, read1, read2, gzipped):
+    read1_stats = get_statistics(dbkey, read1, gzipped)
+    if read2 is None:
+        read2_stats = None
+    else:
+        read2_stats = get_statistics(dbkey, read2, gzipped)
+    return read1_stats, read2_stats
+
+
+def output_statistics(read1_stats, read2_stats, idxstats_file, metrics_file, output_file):
+    paired_reads = read2_stats is not None
+    if paired_reads:
+        columns = ['Reference', 'Read1 FASTQ', 'File Size', 'Reads', 'Mean Read Length', 'Mean Read Quality',
+                   'Reads Passing Q30', 'Read2 FASTQ', 'File Size', 'Reads', 'Mean Read Length', 'Mean Read Quality',
+                   'Reads Passing Q30', 'Total Reads', 'All Mapped Reads', 'Unmapped Reads',
+                   'Unmapped Reads Percentage of Total', 'Reference with Coverage', 'Average Depth of Coverage',
+                   'Good SNP Count']
+    else:
+        columns = ['Reference', 'FASTQ', 'File Size', 'Mean Read Length', 'Mean Read Quality', 'Reads Passing Q30',
+                   'Total Reads', 'All Mapped Reads', 'Unmapped Reads', 'Unmapped Reads Percentage of Total',
+                   'Reference with Coverage', 'Average Depth of Coverage', 'Good SNP Count']
+    with open(output_file, "w") as outfh:
+        outfh.write("%s\n" % "\t".join(columns))
+        line_items = []
+        # Get the current stats and associated files.
+        # Get and output the statistics.
+        line_items.append(read1_stats.reference)
+        line_items.append(read1_stats.fastq_file)
+        line_items.append(read1_stats.file_size)
+        if paired_reads:
+            line_items.append(read1_stats.total_reads)
+        line_items.append(read1_stats.mean_read_length)
+        line_items.append(read1_stats.mean_read_quality)
+        line_items.append(read1_stats.reads_passing_q30)
+        if paired_reads:
+            line_items.append(read2_stats.fastq_file)
+            line_items.append(read2_stats.file_size)
+            line_items.append(read2_stats.total_reads)
+            line_items.append(read2_stats.mean_read_length)
+            line_items.append(read2_stats.mean_read_quality)
+            line_items.append(read2_stats.reads_passing_q30)
         # Total Reads
-        current_sample_df.at[file_name_base, 'Total Reads'] = total_reads
+        if paired_reads:
+            total_reads = read1_stats.total_reads + read2_stats.total_reads
+        else:
+            total_reads = read1_stats.total_reads
+        line_items.append(total_reads)
         # All Mapped Reads
         all_mapped_reads, unmapped_reads = process_idxstats_file(idxstats_file)
-        current_sample_df.at[file_name_base, 'All Mapped Reads'] = all_mapped_reads
-        # Unmapped Reads
-        current_sample_df.at[file_name_base, 'Unmapped Reads'] = unmapped_reads
+        line_items.append(all_mapped_reads)
+        line_items.append(unmapped_reads)
         # Unmapped Reads Percentage of Total
         if unmapped_reads > 0:
            unmapped_reads_percentage = '{:10.2f}'.format(unmapped_reads / total_reads)
        else:
            unmapped_reads_percentage = 0
-        current_sample_df.at[file_name_base, 'Unmapped Reads Percentage of Total'] = unmapped_reads_percentage
+        line_items.append(unmapped_reads_percentage)
         # Reference with Coverage
         ref_with_coverage, avg_depth_of_coverage, good_snp_count = process_metrics_file(metrics_file)
-        current_sample_df.at[file_name_base, 'Reference with Coverage'] = ref_with_coverage
-        # Average Depth of Coverage
-        current_sample_df.at[file_name_base, 'Average Depth of Coverage'] = avg_depth_of_coverage
-        # Good SNP Count
-        current_sample_df.at[file_name_base, 'Good SNP Count'] = good_snp_count
-        data_frames.append(current_sample_df)
-    output_df = pandas.concat(data_frames)
-    output_df.to_csv(output_file, sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\')
+        line_items.append(ref_with_coverage)
+        line_items.append(avg_depth_of_coverage)
+        line_items.append(good_snp_count)
+        outfh.write('%s\n' % '\t'.join(str(x) for x in line_items))
 
 
 def process_idxstats_file(idxstats_file):
@@ -150,44 +194,17 @@
 parser.add_argument('--dbkey', action='store', dest='dbkey', help='Reference dbkey')
 parser.add_argument('--gzipped', action='store_true', dest='gzipped', required=False, default=False, help='Input files are gzipped')
-parser.add_argument('--input_idxstats_dir', action='store', dest='input_idxstats_dir', required=False, default=None, help='Samtools idxstats input directory')
-parser.add_argument('--input_metrics_dir', action='store', dest='input_metrics_dir', required=False, default=None, help='vSNP add zero coverage metrics input directory')
-parser.add_argument('--input_reads_dir', action='store', dest='input_reads_dir', required=False, default=None, help='Samples input directory')
-parser.add_argument('--list_paired', action='store_true', dest='list_paired', required=False, default=False, help='Input samples is a list of paired reads')
 parser.add_argument('--output', action='store', dest='output', help='Output Excel statistics file')
 parser.add_argument('--read1', action='store', dest='read1', help='Required: single read')
 parser.add_argument('--read2', action='store', dest='read2', required=False, default=None, help='Optional: paired read')
 parser.add_argument('--samtools_idxstats', action='store', dest='samtools_idxstats', help='Output of samtools_idxstats')
-parser.add_argument('--vsnp_azc', action='store', dest='vsnp_azc', help='Output of vsnp_add_zero_coverage')
+parser.add_argument('--vsnp_azc_metrics', action='store', dest='vsnp_azc_metrics', help='Output of vsnp_add_zero_coverage')
 
 args = parser.parse_args()
 
-fastq_files = []
+stats_list = []
 idxstats_files = []
 metrics_files = []
 # Accumulate inputs.
-if args.read1 is not None:
-    # The inputs are not dataset collections, so
-    # read1, read2 (possibly) and vsnp_azc will also
-    # not be None.
-    fastq_files.append(args.read1)
-    idxstats_files.append(args.samtools_idxstats)
-    metrics_files.append(args.vsnp_azc)
-    if args.read2 is not None:
-        fastq_files.append(args.read2)
-        idxstats_files.append(args.samtools_idxstats)
-        metrics_files.append(args.vsnp_azc)
-else:
-    for file_name in sorted(os.listdir(args.input_reads_dir)):
-        fastq_files.append(os.path.join(args.input_reads_dir, file_name))
-    for file_name in sorted(os.listdir(args.input_idxstats_dir)):
-        idxstats_files.append(os.path.join(args.input_idxstats_dir, file_name))
-        if args.list_paired:
-            # Add the idxstats file for reverse.
-            idxstats_files.append(os.path.join(args.input_idxstats_dir, file_name))
-    for file_name in sorted(os.listdir(args.input_metrics_dir)):
-        metrics_files.append(os.path.join(args.input_metrics_dir, file_name))
-        if args.list_paired:
-            # Add the metrics file for reverse.
-            metrics_files.append(os.path.join(args.input_metrics_dir, file_name))
-output_statistics(fastq_files, idxstats_files, metrics_files, args.output, args.gzipped, args.dbkey)
+read1_stats, read2_stats = accrue_statistics(args.dbkey, args.read1, args.read2, args.gzipped)
+output_statistics(read1_stats, read2_stats, args.samtools_idxstats, args.vsnp_azc_metrics, args.output)
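For orientation, the refactored script is now driven entirely by the per-sample flags defined in the argparse block above rather than by input directories. A minimal sketch of an equivalent command-line call for the paired-end test case; the file names refer to this changeset's test data and are placeholders that must exist locally for the call to succeed:

```python
import subprocess

# Sketch only: mirrors the flags defined by the argparse block in vsnp_statistics.py.
subprocess.run([
    'python', 'vsnp_statistics.py',
    '--read1', '13-1941-6_S4_L001_R1_600000.fastq.gz',
    '--read2', '13-1941-6_S4_L001_R2_600000.fastq.gz',
    '--gzipped',                                        # inputs are fastqsanger.gz
    '--dbkey', '89',
    '--samtools_idxstats', 'samtools_idxstats2.tabular',
    '--vsnp_azc_metrics', 'add_zc_metrics2.tabular',
    '--output', 'vsnp_statistics2.tabular',             # tab-separated statistics written here
], check=True)
```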
--- a/vsnp_statistics.xml	Thu Jul 22 18:05:22 2021 +0000
+++ b/vsnp_statistics.xml	Mon Aug 02 13:34:09 2021 +0000
@@ -4,122 +4,68 @@
         <import>macros.xml</import>
     </macros>
     <requirements>
-        <requirement type="package" version="1.79">biopython</requirement>
-        <requirement type="package" version="1.21.1">numpy</requirement>
-        <requirement type="package" version="3.0.7">openpyxl</requirement>
-        <requirement type="package" version="1.3.0">pandas</requirement>
-        <requirement type="package" version="2.0.1">xlrd</requirement>
+        <expand macro="biopython_requirement"/>
+        <expand macro="numpy_requirement"/>
+        <expand macro="openpyxl_requirement"/>
+        <expand macro="pandas_requirement"/>
+        <expand macro="xlrd_requirement"/>
     </requirements>
     <command detect_errors="exit_code"><![CDATA[
 #import re
-#set input_idxstats_dir = 'input_idxstats'
-#set input_metrics_dir = 'input_metrics'
-#set input_reads_dir = 'input_reads'
-mkdir -p $input_idxstats_dir &&
-mkdir -p $input_metrics_dir &&
-mkdir -p $input_reads_dir &&
-#if $input_type_cond.input_type == 'single_files':
-    #set read1 = $input_type_cond.read_type_cond.read1
+#if $input_type_cond.input_type in ["single", "pair"]:
+    #set read1 = $input_type_cond.read1
     #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.element_identifier))
     ln -s '${read1}' '${read1_identifier}' &&
-    #if $input_type_cond.read_type_cond.read_type == 'pair':
-        #set read2 = $input_type_cond.read_type_cond.read2
+    #if $input_type_cond.input_type == "pair":
+        #set read2 = $input_type_cond.read2
         #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.element_identifier))
         ln -s '${read2}' '${read2_identifier}' &&
     #else:
-        #set read2 = None
+        #set read2 = None
     #end if
 #else:
-    #if $input_type_cond.collection_type_cond.collection_type == 'single':
-        #for $i in $input_type_cond.collection_type_cond.reads_collection:
-            #set identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier))
-            ln -s '${i.file_name}' '$input_reads_dir/${identifier}' &&
-        #end for
-    #else:
-        #set read1 = $input_type_cond.collection_type_cond.reads_collection['forward']
-        #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.name))
-        ln -s '${read1}' '$input_reads_dir/${read1_identifier}' &&
-        #set read2 = $input_type_cond.collection_type_cond.reads_collection['reverse']
-        #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.name))
-        ln -s '${read2}' '$input_reads_dir/${read2_identifier}' &&
-    #end if
-    #for $i in $input_type_cond.samtools_idxstats:
-        #set identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier))
-        ln -s '${i.file_name}' '$input_idxstats_dir/${identifier}' &&
-    #end for
-    #for $i in $input_type_cond.vsnp_azc:
-        #set identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier))
-        ln -s '${i.file_name}' '$input_metrics_dir/${identifier}' &&
-    #end for
+    #set read1 = $input_type_cond.reads_collection['forward']
+    #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.name))
+    ln -s '${read1}' '${read1_identifier}' &&
+    #set read2 = $input_type_cond.reads_collection['reverse']
+    #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.name))
+    ln -s '${read2}' '${read2_identifier}' &&
 #end if
 python '$__tool_directory__/vsnp_statistics.py'
-#if $input_type_cond.input_type == 'single_files':
-    --dbkey '$input_type_cond.samtools_idxstats.metadata.dbkey'
-    #if $input_type_cond.read_type_cond.read1.is_of_type('fastqsanger.gz'):
-        --gzipped
-    #end if
-    --read1 '${read1_identifier}'
-    #if $input_type_cond.read_type_cond.read_type == 'pair':
-        --read2 '${read2_identifier}'
-    #end if
-    --samtools_idxstats '$input_type_cond.samtools_idxstats'
-    --vsnp_azc '$input_type_cond.vsnp_azc'
-#else:
-    --dbkey '$input_type_cond.samtools_idxstats[0].metadata.dbkey'
-    #if $input_type_cond.collection_type_cond.reads_collection[0].is_of_type('fastqsanger.gz'):
-        --gzipped
-    #end if
-    #if $input_type_cond.collection_type_cond.collection_type == 'paired':
-        --list_paired
-    #end if
-    --input_idxstats_dir '$input_idxstats_dir'
-    --input_metrics_dir '$input_metrics_dir'
-    --input_reads_dir '$input_reads_dir'
+--read1 '${read1_identifier}'
+#if $read2 is not None
+    --read2 '${read2_identifier}'
 #end if
+#if $read1.is_of_type('fastqsanger.gz'):
+    --gzipped
+#end if
+--dbkey '$samtools_idxstats.metadata.dbkey'
+--samtools_idxstats '$samtools_idxstats'
+--vsnp_azc_metrics '$vsnp_azc_metrics'
 --output '$output'
 ]]></command>
     <inputs>
         <conditional name="input_type_cond">
             <param name="input_type" type="select" label="Choose the category of the files to be analyzed">
-                <option value="single_files" selected="true">Single files</option>
-                <option value="collections">Collections of files</option>
+                <option value="single" selected="true">Single files</option>
+                <option value="paired">Paired reads</option>
+                <option value="pair">Paired reads in separate data sets</option>
             </param>
-            <when value="single_files">
-                <conditional name="read_type_cond">
-                    <param name="read_type" type="select" label="Choose the read type">
-                        <option value="single" selected="true">Single reads</option>
-                        <option value="pair">Paired reads</option>
-                    </param>
-                    <when value="single">
-                        <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
-                    </when>
-                    <when value="pair">
-                        <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
-                        <param name="read2" type="data" format="fastqsanger.gz,fastqsanger" label="Read2 fastq file"/>
-                    </when>
-                </conditional>
-                <param name="samtools_idxstats" type="data" format="tabular" label="Samtools idxstats file"/>
-                <param name="vsnp_azc" type="data" format="tabular" label="vSNP: add zero coverage metrics file"/>
+            <when value="single">
+                <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
             </when>
-            <when value="collections">
-                <conditional name="collection_type_cond">
-                    <param name="collection_type" type="select" label="Collections of single reads or paired reads?">
-                        <option value="single" selected="true">Single reads</option>
-                        <option value="paired">Paired reads in separate datasets</option>
-                    </param>
-                    <when value="single">
-                        <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="list" label="Collection of fastqsanger files"/>
-                    </when>
-                    <when value="paired">
-                        <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="paired" label="Collection of fastqsanger paired read files"/>
-                    </when>
-                </conditional>
-                <param name="samtools_idxstats" type="data_collection" format="tabular" collection_type="list" label="Collection of samtools idxstats files"/>
-                <param name="vsnp_azc" type="data_collection" format="tabular" collection_type="list" label="Collection of vSNP: add zero coverage metrics files"/>
+            <when value="paired">
+                <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="paired" label="Collection of fastqsanger paired read files"/>
+            </when>
+            <when value="pair">
+                <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
+                <param name="read2" type="data" format="fastqsanger.gz,fastqsanger" label="Read2 fastq file"/>
             </when>
         </conditional>
+        <param name="samtools_idxstats" type="data" format="tabular" label="Samtools idxstats file"/>
+        <param name="vsnp_azc_metrics" type="data" format="tabular" label="vSNP: add zero coverage metrics file"/>
     </inputs>
     <outputs>
         <data name="output" format="tabular"/>
@@ -127,67 +73,32 @@
     <tests>
         <!-- A single fastq file -->
         <test expect_num_outputs="1">
-            <param name="input_type" value="single_files"/>
-            <param name="read_type" value="single"/>
+            <param name="input_type" value="single"/>
             <param name="read1" value="Mcap_Deer_DE_SRR650221.fastq.gz" ftype="fastqsanger.gz" dbkey="89"/>
             <param name="samtools_idxstats" value="samtools_idxstats1.tabular" ftype="tabular" dbkey="89"/>
-            <param name="vsnp_azc" value="add_zc_metrics1.tabular" ftype="tabular" dbkey="89"/>
+            <param name="vsnp_azc_metrics" value="add_zc_metrics1.tabular" ftype="tabular" dbkey="89"/>
             <output name="output" file="vsnp_statistics1.tabular" ftype="tabular"/>
         </test>
         <!-- A set of paired fastq files -->
         <test expect_num_outputs="1">
-            <param name="input_type" value="single_files"/>
-            <param name="read_type" value="pair"/>
+            <param name="input_type" value="pair"/>
             <param name="read1" value="13-1941-6_S4_L001_R1_600000.fastq.gz" ftype="fastqsanger.gz" dbkey="89"/>
             <param name="read2" value="13-1941-6_S4_L001_R2_600000.fastq.gz" ftype="fastqsanger.gz" dbkey="89"/>
             <param name="samtools_idxstats" value="samtools_idxstats2.tabular" ftype="tabular" dbkey="89"/>
-            <param name="vsnp_azc" value="add_zc_metrics2.tabular" ftype="tabular" dbkey="89"/>
+            <param name="vsnp_azc_metrics" value="add_zc_metrics2.tabular" ftype="tabular" dbkey="89"/>
            <output name="output" file="vsnp_statistics2.tabular" ftype="tabular"/>
        </test>
-        <!-- A collection of SE fastq files -->
+        <!-- A collection of paired fastq files -->
        <test expect_num_outputs="1">
-            <param name="input_type" value="collections"/>
-            <param name="read_type" value="single"/>
-            <param name="reads_collection">
-                <collection type="list">
-                    <element name="Mcap_Deer_DE_SRR650221.fastq.gz" value="Mcap_Deer_DE_SRR650221.fastq.gz" dbkey="89"/>
-                    <element name="13-1941-6_S4_L001_R1_600000.fastq.gz" value="13-1941-6_S4_L001_R1_600000.fastq.gz" dbkey="89"/>
-                </collection>
-            </param>
-            <param name="samtools_idxstats">
-                <collection type="list">
-                    <element name="13-1941-6_S4_L001_R1_600000.fastq.gz" value="samtools_idxstats3.tabular" dbkey="89"/>
-                    <element name="Mcap_Deer_DE_SRR650221.fastq.gz" value="samtools_idxstats4.tabular" dbkey="89"/>
-                </collection>
-            </param>
-            <param name="vsnp_azc">
-                <collection type="list">
-                    <element name="13-1941-6_S4_L001_R1_600000.fastq.gz" value="add_zc_metrics3.tabular" dbkey="89"/>
-                    <element name="Mcap_Deer_DE_SRR650221.fastq.gz" value="add_zc_metrics4.tabular" dbkey="89"/>
-                </collection>
-            </param>
-            <output name="output" file="vsnp_statistics3.tabular" ftype="tabular"/>
-        </test>
-        <!-- A collection of PE fastq files -->
-        <test expect_num_outputs="1">
-            <param name="input_type" value="collections"/>
-            <param name="collection_type" value="paired"/>
+            <param name="input_type" value="paired"/>
             <param name="reads_collection">
                 <collection type="paired">
                     <element name="forward" value="13-1941-6_S4_L001_R1_600000.fastq.gz" ftype="fastqsanger.gz"/>
                     <element name="reverse" value="13-1941-6_S4_L001_R2_600000.fastq.gz" ftype="fastqsanger.gz"/>
                 </collection>
             </param>
-            <param name="samtools_idxstats">
-                <collection type="list">
-                    <element name="13-1941-6_S4_L001_R1_600000.fastq" value="samtools_idxstats5.tabular" dbkey="89"/>
-                </collection>
-            </param>
-            <param name="vsnp_azc">
-                <collection type="list">
-                    <element name="13-1941-6_S4_L001_R1_600000.fastq" value="add_zc_metrics5.tabular" dbkey="89"/>
-                </collection>
-            </param>
+            <param name="samtools_idxstats" value="samtools_idxstats5.tabular" ftype="tabular" dbkey="89"/>
+            <param name="vsnp_azc_metrics" value="add_zc_metrics5.tabular" ftype="tabular" dbkey="89"/>
             <output name="output" file="vsnp_statistics4.tabular" ftype="tabular"/>
         </test>
     </tests>
@@ -195,8 +106,8 @@
 **What it does**
 
 Accepts associated fastq files, SAMtools idxstats files and **vSNP: add zero coverage** metrics files and extracts information from them
-to produce an Excel spreadsheet containing statistics for each sample. The samples can be single or paired reads, and all associated inputs
-can be either single files or collections of files. The output statistics include reference, file size, mean read length, mean read quality,
+to produce an Excel spreadsheet containing statistics for each sample. The samples can be a single read, a single set of paired reads in
+separate datasets or collection of paired reads. The output statistics include reference, file size, mean read length, mean read quality,
 reads passing Q30, total reads, all mapped reads, unmapped reads, unmapped reads percentage of total, reference with coverage, average depth
 of coverage and good SNP count.
 </help>