changeset 1:14e29f7d59ca draft

Uploaded
author greg
date Wed, 29 Apr 2020 16:56:10 -0400
parents c21d338dbdc4
children 7fe0cbb8c894
files test-data/vsnp_statistics.tabular test-data/vsnp_statistics.xlsx vsnp_statistics.py vsnp_statistics.xml
diffstat 4 files changed, 123 insertions(+), 188 deletions(-)
--- a/test-data/vsnp_statistics.tabular	Tue Apr 21 10:19:53 2020 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,22 +0,0 @@
-Sample	13-1941-6
-Reference	89
-Read1 File	13-1941-6_S4_L001_R1_600000_fastq_gz
-Read1 File Size	5.1 KB
-Read1 Total Reads	25
-Read1 Mean Read Length	230.6
-Read1 Mean Read Quality	30.8
-Read1 Reads Passing Q30	60.0%
-Read2 File	13-1941-6_S4_L001_R2_600000_fastq_gz
-Read2 File Size	5.6 KB
-Read2 Total Reads	25
-Read2 Mean Read Length	239.7
-Read2 Mean Read Quality	22.2
-Read2 Reads Passing Q30	4.0%
-Total Reads	50
-All Mapped Reads	45
-Unmapped Reads	5
-Unmapped Reads Percentage of Total	10.0%
-Average Depth of Coverage	10.338671
-Reference with Coverage	98.74%
-
-Good SNP Count	611
Binary file test-data/vsnp_statistics.xlsx has changed
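
The tabular test fixture above is replaced by a binary Excel workbook. As an aside, a minimal way to inspect such a workbook locally (assuming a Python environment with pandas and the xlrd engine that is pinned in the tool requirements further down) could be:

    import pandas

    # Illustrative only: load the new binary test fixture and peek at its layout.
    # The path is the repository-relative location of the file added in this changeset.
    df = pandas.read_excel('test-data/vsnp_statistics.xlsx', index_col=0)
    print(df.columns.tolist())
    print(df.head())
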
--- a/vsnp_statistics.py	Tue Apr 21 10:19:53 2020 -0400
+++ b/vsnp_statistics.py	Wed Apr 29 16:56:10 2020 -0400
@@ -2,19 +2,22 @@
 
 import argparse
 import gzip
-import multiprocessing
 import numpy
 import os
 import pandas
-import queue
+import shutil
 
 INPUT_IDXSTATS_DIR = 'input_idxstats'
 INPUT_METRICS_DIR = 'input_metrics'
 INPUT_READS_DIR = 'input_reads'
-OUTPUT_DIR = 'output'
 QUALITYKEY = {'!':'0', '"':'1', '#':'2', '$':'3', '%':'4', '&':'5', "'":'6', '(':'7', ')':'8', '*':'9', '+':'10', ',':'11', '-':'12', '.':'13', '/':'14', '0':'15', '1':'16', '2':'17', '3':'18', '4':'19', '5':'20', '6':'21', '7':'22', '8':'23', '9':'24', ':':'25', ';':'26', '<':'27', '=':'28', '>':'29', '?':'30', '@':'31', 'A':'32', 'B':'33', 'C':'34', 'D':'35', 'E':'36', 'F':'37', 'G':'38', 'H':'39', 'I':'40', 'J':'41', 'K':'42', 'L':'43', 'M':'44', 'N':'45', 'O':'46', 'P':'47', 'Q':'48', 'R':'49', 'S':'50', 'T':'51', 'U':'52', 'V':'53', 'W':'54', 'X':'55', 'Y':'56', 'Z':'57', '_':'1', ']':'1', '[':'1', '\\':'1', '\n':'1', '`':'1', 'a':'1', 'b':'1', 'c':'1', 'd':'1', 'e':'1', 'f':'1', 'g':'1', 'h':'1', 'i':'1', 'j':'1', 'k':'1', 'l':'1', 'm':'1', 'n':'1', 'o':'1', 'p':'1', 'q':'1', 'r':'1', 's':'1', 't':'1', 'u':'1', 'v':'1', 'w':'1', 'x':'1', 'y':'1', 'z':'1', ' ':'1'}
-READCOLUMNS = ['Sample', 'Reference', 'Fastq File', 'Size', 'Total Reads', 'Mean Read Length', 'Mean Read Quality', 'Reads Passing Q30']
-SEP = "\t"
+
+
+def fastq_to_df(fastq_file, gzipped):
+    if gzipped.lower() == "true":
+        return pandas.read_csv(gzip.open(fastq_file, "r"), header=None, sep="^")
+    else:
+        return pandas.read_csv(open(fastq_file, "r"), header=None, sep="^")
 
 
 def get_base_file_name(file_path):
@@ -52,121 +55,108 @@
     return '??? bytes'
 
 
-def output_read_stats(gzipped, fastq_file, ofh, sampling_number=10000, output_sample=False, dbkey=None, collection=False):
-    file_name_base = os.path.basename(fastq_file)
-    # Output a 2-column file where column 1 is
-    # the labels and column 2 is the values.
-    if output_sample:
-        # The Sample and Reference columns should be
-        # output only once, so this block handles
-        # paired reads, where the columns are not
-        # output for Read2.
-        try:
-            # Illumina read file names are something like:
-            # 13-1941-6_S4_L001_R1_600000_fastq_gz
-            sample = file_name_base.split("_")[0]
-        except Exception:
-            sample = ""
-        # Sample
-        ofh.write("Sample%s%s\n" % (SEP, sample))
-        ofh.write("Reference%s%s\n" % (SEP, dbkey))
-        if collection:
-            read = "Read"
+def output_statistics(reads_files, idxstats_files, metrics_files, output_file, gzipped, dbkey):
+    # Produce an Excel spreadsheet that
+    # contains a row for each sample.
+    columns = ['Reference', 'File Size', 'Mean Read Length', 'Mean Read Quality', 'Reads Passing Q30',
+               'Total Reads', 'All Mapped Reads', 'Unmapped Reads', 'Unmapped Reads Percentage of Total',
+               'Reference with Coverage', 'Average Depth of Coverage', 'Good SNP Count']
+    data_frames = []
+    for i, fastq_file in enumerate(reads_files):
+        idxstats_file = idxstats_files[i]
+        metrics_file = metrics_files[i]
+        file_name_base = os.path.basename(fastq_file)
+        # Read fastq_file into a data frame.
+        fastq_df = fastq_to_df(fastq_file, gzipped)
+        total_reads = int(len(fastq_df.index) / 4)
+        current_sample_df = pandas.DataFrame(index=[file_name_base], columns=columns)
+        # Reference
+        current_sample_df.at[file_name_base, 'Reference'] = dbkey
+        # File Size
+        current_sample_df.at[file_name_base, 'File Size'] = nice_size(os.path.getsize(fastq_file))
+        # Mean Read Length
+        sampling_size = 10000
+        if sampling_size > total_reads:
+            sampling_size = total_reads
+        fastq_df = fastq_df.iloc[3::4].sample(sampling_size)
+        dict_mean = {}
+        list_length = []
+        for index, row in fastq_df.iterrows():
+            base_qualities = []
+            for base in list(row.array[0]):
+                base_qualities.append(int(QUALITYKEY[base]))
+            dict_mean[index] = numpy.mean(base_qualities)
+            list_length.append(len(row.array[0]))
+        current_sample_df.at[file_name_base, 'Mean Read Length'] = "%.1f" % numpy.mean(list_length)
+        # Mean Read Quality
+        df_mean = pandas.DataFrame.from_dict(dict_mean, orient='index', columns=['ave'])
+        current_sample_df.at[file_name_base, 'Mean Read Quality'] = "%.1f" % df_mean['ave'].mean()
+        # Reads Passing Q30
+        reads_gt_q30 = len(df_mean[df_mean['ave'] >= 30])
+        reads_passing_q30 = "{:10.2f}".format(reads_gt_q30 / sampling_size)
+        current_sample_df.at[file_name_base, 'Reads Passing Q30'] = reads_passing_q30
+        # Total Reads
+        current_sample_df.at[file_name_base, 'Total Reads'] = total_reads
+        # All Mapped Reads
+        all_mapped_reads, unmapped_reads = process_idxstats_file(idxstats_file)
+        current_sample_df.at[file_name_base, 'All Mapped Reads'] = all_mapped_reads
+        # Unmapped Reads
+        current_sample_df.at[file_name_base, 'Unmapped Reads'] = unmapped_reads
+        # Unmapped Reads Percentage of Total
+        if unmapped_reads > 0:
+            unmapped_reads_percentage = "{:10.2f}".format(unmapped_reads / total_reads)
         else:
-            read = "Read1"
-    else:
-        read = "Read2"
-    # Read
-    ofh.write("%s File%s%s\n" % (read, SEP, file_name_base))
-    # File Size
-    ofh.write("%s File Size%s%s\n" % (read, SEP, nice_size(os.path.getsize(fastq_file))))
-    if gzipped.lower() == "true":
-        df = pandas.read_csv(gzip.open(fastq_file, "r"), header=None, sep="^")
-    else:
-        df = pandas.read_csv(open(fastq_file, "r"), header=None, sep="^")
-    total_read_count = int(len(df.index) / 4)
-    # Readx Total Reads
-    ofh.write("%s Total Reads%s%s\n" % (read, SEP, total_read_count))
-    # Mean Read Length
-    sampling_size = int(sampling_number)
-    if sampling_size > total_read_count:
-        sampling_size = total_read_count
-    df = df.iloc[3::4].sample(sampling_size)
-    dict_mean = {}
-    list_length = []
-    for index, row in df.iterrows():
-        base_qualities = []
-        for base in list(row.array[0]):
-            base_qualities.append(int(QUALITYKEY[base]))
-        dict_mean[index] = numpy.mean(base_qualities)
-        list_length.append(len(row.array[0]))
-    ofh.write("%s Mean Read Length%s%s\n" % (read, SEP, "%.1f" % numpy.mean(list_length)))
-    # Mean Read Quality
-    df_mean = pandas.DataFrame.from_dict(dict_mean, orient='index', columns=['ave'])
-    ofh.write("%s Mean Read Quality%s%s\n" % (read, SEP, "%.1f" % df_mean['ave'].mean()))
-    # Reads Passing Q30
-    reads_gt_q30 = len(df_mean[df_mean['ave'] >= 30])
-    reads_passing_q30 = "{:10.2f}".format(reads_gt_q30 / sampling_size)
-    ofh.write("%s reads passing Q30%s%s\n" % (read, SEP, reads_passing_q30))
-    return total_read_count
+            unmapped_reads_percentage = 0
+        current_sample_df.at[file_name_base, 'Unmapped Reads Percentage of Total'] = unmapped_reads_percentage
+        # Reference with Coverage
+        ref_with_coverage, avg_depth_of_coverage, good_snp_count = process_metrics_file(metrics_file)
+        current_sample_df.at[file_name_base, 'Reference with Coverage'] = ref_with_coverage
+        # Average Depth of Coverage
+        current_sample_df.at[file_name_base, 'Average Depth of Coverage'] = avg_depth_of_coverage
+        # Good SNP Count
+        current_sample_df.at[file_name_base, 'Good SNP Count'] = good_snp_count
+        data_frames.append(current_sample_df)
+    excel_df = pandas.concat(data_frames)
+    excel_file_name = "output.xlsx"
+    writer = pandas.ExcelWriter(excel_file_name, engine='xlsxwriter')
+    excel_df.to_excel(writer, sheet_name='Sheet1')
+    writer.save()
+    shutil.move(excel_file_name, output_file)
 
 
-def output_statistics(task_queue, read2, collection, gzipped, dbkey, timeout):
-    while True:
-        try:
-            tup = task_queue.get(block=True, timeout=timeout)
-        except queue.Empty:
-            break
-        read_file, idxstats_file, metrics_file, output_file = tup
-        total_reads = 0
-        with open(output_file, "w") as ofh:
-            total_reads += output_read_stats(gzipped, read_file, ofh, output_sample=True, dbkey=dbkey, collection=collection)
-            if read2 is not None:
-                total_reads += output_read_stats(gzipped, read2, ofh)
-            ofh.write("Total Reads%s%d\n" % (SEP, total_reads))
-            with open(idxstats_file, "r") as ifh:
-                unmapped_reads = 0
-                for i, line in enumerate(ifh):
-                    items = line.split("\t")
-                    if i == 0:
-                        # NC_002945.4 4349904 213570 4047
-                        ofh.write("All Mapped Reads%s%s\n" % (SEP, items[2]))
-                    elif i == 1:
-                        # * 0 0 82774
-                        unmapped_reads = int(items[3])
-                        ofh.write("Unmapped Reads%s%d\n" % (SEP, unmapped_reads))
-                percent_str = "Unmapped Reads Percentage of Total"
-                if unmapped_reads > 0:
-                    unmapped_reads_percentage = "{:10.2f}".format(unmapped_reads / total_reads)
-                    ofh.write("%s%s%s\n" % (percent_str, SEP, unmapped_reads_percentage))
-                else:
-                    ofh.write("%s%s0\n" % (percent_str, SEP))
-            with open(metrics_file, "r") as ifh:
-                for i, line in enumerate(ifh):
-                    if i == 0:
-                        # Skip comments.
-                        continue
-                    items = line.split("\t")
-                    if i == 1:
-                        # MarkDuplicates 10.338671 98.74%
-                        ofh.write("Average Depth of Coverage%s%s\n" % (SEP, items[2]))
-                        ofh.write("Reference with Coverage%s%s\n" % (SEP, items[3]))
-                    elif i == 2:
-                        # VCFfilter 611
-                        ofh.write("Good SNP Count%s%s\n" % (SEP, items[1]))
-        task_queue.task_done()
+def process_idxstats_file(idxstats_file):
+    all_mapped_reads = 0
+    unmapped_reads = 0
+    with open(idxstats_file, "r") as fh:
+        for i, line in enumerate(fh):
+            items = line.split("\t")
+            if i == 0:
+                # NC_002945.4 4349904 213570 4047
+                all_mapped_reads = int(items[2])
+            elif i == 1:
+                # * 0 0 82774
+                unmapped_reads = int(items[3])
+    return all_mapped_reads, unmapped_reads
 
 
-def set_num_cpus(num_files, processes):
-    num_cpus = int(multiprocessing.cpu_count())
-    if num_files < num_cpus and num_files < processes:
-        return num_files
-    if num_cpus < processes:
-        half_cpus = int(num_cpus / 2)
-        if num_files < half_cpus:
-            return num_files
-        return half_cpus
-    return processes
+def process_metrics_file(metrics_file):
+    ref_with_coverage = '0%'
+    avg_depth_of_coverage = 0
+    good_snp_count = 0
+    with open(metrics_file, "r") as ifh:
+        for i, line in enumerate(ifh):
+            if i == 0:
+                # Skip comments.
+                continue
+            items = line.split("\t")
+            if i == 1:
+                # MarkDuplicates 10.338671 98.74%
+                ref_with_coverage = items[3]
+                avg_depth_of_coverage = items[2]
+            elif i == 2:
+                # VCFfilter 611
+                good_snp_count = items[1]
+    return ref_with_coverage, avg_depth_of_coverage, good_snp_count
 
 
 if __name__ == '__main__':
@@ -177,60 +167,36 @@
     parser.add_argument('--dbkey', action='store', dest='dbkey', help='Reference dbkey')
     parser.add_argument('--gzipped', action='store', dest='gzipped', help='Input files are gzipped')
     parser.add_argument('--samtools_idxstats', action='store', dest='samtools_idxstats', required=False, default=None, help='Output of samtools_idxstats')
-    parser.add_argument('--output', action='store', dest='output', required=False, default=None, help='Output statisticsfile')
+    parser.add_argument('--output', action='store', dest='output', help='Output Excel statistics file')
     parser.add_argument('--vsnp_azc', action='store', dest='vsnp_azc', required=False, default=None, help='Output of vsnp_add_zero_coverage')
-    parser.add_argument('--processes', action='store', dest='processes', type=int, help='User-selected number of processes to use for job splitting')
 
     args = parser.parse_args()
+    print("args:\n%s\n" % str(args))
 
     reads_files = []
     idxstats_files = []
     metrics_files = []
-    output_files = []
-    if args.output is not None:
-        # The inputs were not dataset collections, so
+    # Accumulate inputs.
+    if args.read1 is not None:
+        # The inputs are not dataset collections, so
         # read1, read2 (possibly) and vsnp_azc will also
         # not be None.
-        collection = False
         reads_files.append(args.read1)
         idxstats_files.append(args.samtools_idxstats)
         metrics_files.append(args.vsnp_azc)
-        output_files.append(args.output)
+        if args.read2 is not None:
+            reads_files.append(args.read2)
+            idxstats_files.append(args.samtools_idxstats)
+            metrics_files.append(args.vsnp_azc)
     else:
-        collection = True
         for file_name in sorted(os.listdir(INPUT_READS_DIR)):
             file_path = os.path.abspath(os.path.join(INPUT_READS_DIR, file_name))
             reads_files.append(file_path)
             base_file_name = get_base_file_name(file_path)
-            output_files.append(os.path.abspath(os.path.join(OUTPUT_DIR, "%s.tabular" % base_file_name)))
         for file_name in sorted(os.listdir(INPUT_IDXSTATS_DIR)):
             file_path = os.path.abspath(os.path.join(INPUT_IDXSTATS_DIR, file_name))
             idxstats_files.append(file_path)
         for file_name in sorted(os.listdir(INPUT_METRICS_DIR)):
             file_path = os.path.abspath(os.path.join(INPUT_METRICS_DIR, file_name))
             metrics_files.append(file_path)
-
-    multiprocessing.set_start_method('spawn')
-    queue1 = multiprocessing.JoinableQueue()
-    num_files = len(output_files)
-    cpus = set_num_cpus(num_files, args.processes)
-    # Set a timeout for get()s in the queue.
-    timeout = 0.05
-
-    for i, output_file in enumerate(output_files):
-        read_file = reads_files[i]
-        idxstats_file = idxstats_files[i]
-        metrics_file = metrics_files[i]
-        queue1.put((read_file, idxstats_file, metrics_file, output_file))
-
-    # Complete the output_statistics task.
-    processes = [multiprocessing.Process(target=output_statistics, args=(queue1, args.read2, collection, args.gzipped, args.dbkey, timeout, )) for _ in range(cpus)]
-    for p in processes:
-        p.start()
-    for p in processes:
-        p.join()
-    queue1.join()
-
-    if queue1.empty():
-        queue1.close()
-        queue1.join_thread()
+    output_statistics(reads_files, idxstats_files, metrics_files, args.output, args.gzipped, args.dbkey)
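
Taken out of diff context, the heart of the rewritten script is a one-row-per-sample DataFrame pattern: build a single-row frame per sample, concatenate the frames, and write one Excel sheet. Below is a stripped-down, self-contained sketch of that pattern (the helper name write_statistics is illustrative, the column list is abbreviated, and the values are hard-coded; the committed code above is the authoritative version):

    import shutil
    import pandas

    def write_statistics(samples, output_file):
        # samples: dict mapping a sample name to a dict of statistic -> value.
        columns = ['Reference', 'File Size', 'Total Reads', 'Good SNP Count']
        frames = []
        for name, stats in samples.items():
            # One single-row DataFrame per sample, indexed by the sample name.
            row_df = pandas.DataFrame(index=[name], columns=columns)
            for column in columns:
                row_df.at[name, column] = stats.get(column)
            frames.append(row_df)
        excel_df = pandas.concat(frames)
        # pandas 0.25.x / xlsxwriter API, matching the pinned requirements.
        writer = pandas.ExcelWriter('output.xlsx', engine='xlsxwriter')
        excel_df.to_excel(writer, sheet_name='Sheet1')
        writer.save()
        shutil.move('output.xlsx', output_file)

    write_statistics({'13-1941-6_R1': {'Reference': '89', 'Total Reads': 25}}, 'stats.xlsx')
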
--- a/vsnp_statistics.xml	Tue Apr 21 10:19:53 2020 -0400
+++ b/vsnp_statistics.xml	Wed Apr 29 16:56:10 2020 -0400
@@ -4,6 +4,8 @@
         <requirement type="package" version="0.5.1">humanize</requirement>
         <requirement type="package" version="1.16.5">numpy</requirement>
         <requirement type="package" version="0.25.3">pandas</requirement>
+        <requirement type="package" version="1.2.0">xlrd</requirement>
+        <requirement type="package" version="1.2.8">xlsxwriter</requirement>
     </requirements>
     <command detect_errors="exit_code"><![CDATA[
 #import os
@@ -13,11 +15,9 @@
 #set input_idxstats_dir = 'input_idxstats'
 #set input_metrics_dir = 'input_metrics'
 #set input_reads_dir = 'input_reads'
-#set output_dir = 'output'
 mkdir -p $input_idxstats_dir &&
 mkdir -p $input_metrics_dir &&
 mkdir -p $input_reads_dir &&
-mkdir -p $output_dir &&
 #if str($input_type) == "single":
     #set read_type_cond = $input_type_cond.read_type_cond
     #set read1 = $read_type_cond.read1
@@ -44,22 +44,21 @@
         #end if
         #set filename = $i.file_name
         #set identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier))
-        ln -s $filename $input_reads_dir/$identifier &&
+        ln -s '$filename' '$input_reads_dir/$identifier' &&
     #end for
     #for $i in $input_type_cond.samtools_idxstats_collection:
         #set filename = $i.file_name
         #set identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier))
-        ln -s $filename $input_idxstats_dir/$identifier &&
+        ln -s '$filename' '$input_idxstats_dir/$identifier' &&
     #end for
     #for $i in $input_type_cond.azc_metrics_collection:
         #set dbkey = $i.metadata.dbkey
         #set filename = $i.file_name
         #set identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier))
-        ln -s $filename $input_metrics_dir/$identifier &&
+        ln -s '$filename' '$input_metrics_dir/$identifier' &&
     #end for
 #end if
 python '$__tool_directory__/vsnp_statistics.py'
---processes $processes
 --dbkey '$dbkey'
 --gzipped '$gzipped'
 #if str($input_type) == "single":
@@ -71,8 +70,8 @@
     #end if
     --samtools_idxstats '$samtools_idxstats'
     --vsnp_azc '$vsnp_azc'
-    --output '$output'
 #end if
+--output '$output'
 ]]></command>
     <inputs>
         <conditional name="input_type_cond">
@@ -107,16 +106,9 @@
                 <param name="azc_metrics_collection" type="data_collection" format="tabular" collection_type="list" label="Collection of vSNP zero-coverage metrics files"/>
             </when>
         </conditional>
-        <param name="processes" type="integer" min="1" max="20" value="8" label="Number of processes for job splitting"/>
     </inputs>
     <outputs>
-        <data name="output" format="tabular">
-            <filter>input_type_cond['input_type'] == 'single'</filter>
-        </data>
-        <collection name="output_collection" type="list">
-            <discover_datasets pattern="__name__" directory="output" format="tabular" />
-            <filter>input_type_cond['input_type'] == 'collection'</filter>
-        </collection>
+        <data name="output" format="xlsx"/>
     </outputs>
     <tests>
         <test>
@@ -124,7 +116,7 @@
             <param name="read2" value="13-1941-6_S4_L001_R2_600000.fastq.gz" ftype="fastqsanger.gz" dbkey="89"/>
             <param name="samtools_idxstats" value="samtools_idxstats.tabular" ftype="tabular" dbkey="89"/>
             <param name="vsnp_azc" value="add_zc_metrics.tabular" ftype="tabular" dbkey="89"/>
-            <output name="output" file="vsnp_statistics.tabular" ftype="tabular" compare="contains"/>
+            <output name="output" file="vsnp_statistics.xlsx" ftype="xlsx" compare="sim_size"/>
         </test>
     </tests>
     <help>
@@ -138,7 +130,6 @@
 **Required options**
 
 * **Choose the type for files to be analyzed** - select "Single files" or "Collections of files", then select the appropriate history items (single or paired fastqsanger reads, or collections of fastqsanger reads with their associated idxstats and vSNP zero-coverage metrics files) based on the selected option.
- * **Number of processes for job splitting** - Select the number of processes for splitting the job to shorten execution time.
     </help>
     <citations>
         <citation type="bibtex">