comparison vsnp_statistics.py @ 4:2d6c6b01319e draft

Uploaded
author greg
date Sun, 03 Jan 2021 15:47:28 +0000
parents 321a8259e3f9
children d0fbdeaaa488
comparison
equal deleted inserted replaced
3:321a8259e3f9 4:2d6c6b01319e
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 import argparse 3 import argparse
4 import gzip 4 import gzip
5 import os
6 import shutil
7
5 import numpy 8 import numpy
6 import os
7 import pandas 9 import pandas
8 import shutil 10
9
10 INPUT_IDXSTATS_DIR = 'input_idxstats'
11 INPUT_METRICS_DIR = 'input_metrics'
12 INPUT_READS_DIR = 'input_reads'
13 QUALITYKEY = {'!': '0', '"': '1', '#': '2', '$': '3', '%': '4', '&': '5', "'": '6', '(': '7', 11 QUALITYKEY = {'!': '0', '"': '1', '#': '2', '$': '3', '%': '4', '&': '5', "'": '6', '(': '7',
14 ')': '8', '*': '9', '+': '10', ',': '11', '-': '12', '.': '13', '/': '14', '0': '15', 12 ')': '8', '*': '9', '+': '10', ',': '11', '-': '12', '.': '13', '/': '14', '0': '15',
15 '1': '16', '2': '17', '3': '18', '4': '19', '5': '20', '6': '21', '7': '22', 13 '1': '16', '2': '17', '3': '18', '4': '19', '5': '20', '6': '21', '7': '22',
16 '8': '23', '9': '24', ':': '25', ';': '26', '<': '27', '=': '28', '>': '29', 14 '8': '23', '9': '24', ':': '25', ';': '26', '<': '27', '=': '28', '>': '29',
17 '?': '30', '@': '31', 'A': '32', 'B': '33', 'C': '34', 'D': '35', 'E': '36', 15 '?': '30', '@': '31', 'A': '32', 'B': '33', 'C': '34', 'D': '35', 'E': '36',
24 's': '1', 't': '1', 'u': '1', 'v': '1', 'w': '1', 'x': '1', 'y': '1', 'z': '1', 22 's': '1', 't': '1', 'u': '1', 'v': '1', 'w': '1', 'x': '1', 'y': '1', 'z': '1',
25 ' ': '1'} 23 ' ': '1'}
26 24
27 25
28 def fastq_to_df(fastq_file, gzipped): 26 def fastq_to_df(fastq_file, gzipped):
29 if gzipped.lower() == "true": 27 if gzipped:
30 return pandas.read_csv(gzip.open(fastq_file, "r"), header=None, sep="^") 28 return pandas.read_csv(gzip.open(fastq_file, "r"), header=None, sep="^")
31 else: 29 return pandas.read_csv(open(fastq_file, "r"), header=None, sep="^")
32 return pandas.read_csv(open(fastq_file, "r"), header=None, sep="^")
33
34
35 def get_base_file_name(file_path):
36 base_file_name = os.path.basename(file_path)
37 if base_file_name.find(".") > 0:
38 # Eliminate the extension.
39 return os.path.splitext(base_file_name)[0]
40 elif base_file_name.find("_") > 0:
41 # The dot extension was likely changed to
42 # the " character.
43 items = base_file_name.split("_")
44 return "_".join(items[0:-1])
45 else:
46 return base_file_name
47 30
48 31
49 def nice_size(size): 32 def nice_size(size):
50 # Returns a readably formatted string with the size 33 # Returns a readably formatted string with the size
51 words = ['bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB'] 34 words = ['bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB']
65 return "%s%d bytes" % (prefix, size) 48 return "%s%d bytes" % (prefix, size)
66 return "%s%.1f %s" % (prefix, size, word) 49 return "%s%.1f %s" % (prefix, size, word)
67 return '??? bytes' 50 return '??? bytes'
68 51
69 52
70 def output_statistics(reads_files, idxstats_files, metrics_files, output_file, gzipped, dbkey): 53 def output_statistics(fastq_files, idxstats_files, metrics_files, output_file, gzipped, dbkey):
71 # Produce an Excel spreadsheet that 54 # Produce an Excel spreadsheet that
72 # contains a row for each sample. 55 # contains a row for each sample.
73 columns = ['Reference', 'File Size', 'Mean Read Length', 'Mean Read Quality', 'Reads Passing Q30', 56 columns = ['Reference', 'File Size', 'Mean Read Length', 'Mean Read Quality', 'Reads Passing Q30',
74 'Total Reads', 'All Mapped Reads', 'Unmapped Reads', 'Unmapped Reads Percentage of Total', 57 'Total Reads', 'All Mapped Reads', 'Unmapped Reads', 'Unmapped Reads Percentage of Total',
75 'Reference with Coverage', 'Average Depth of Coverage', 'Good SNP Count'] 58 'Reference with Coverage', 'Average Depth of Coverage', 'Good SNP Count']
76 data_frames = [] 59 data_frames = []
77 for i, fastq_file in enumerate(reads_files): 60 for i, fastq_file in enumerate(fastq_files):
78 idxstats_file = idxstats_files[i] 61 idxstats_file = idxstats_files[i]
79 metrics_file = metrics_files[i] 62 metrics_file = metrics_files[i]
80 file_name_base = os.path.basename(fastq_file) 63 file_name_base = os.path.basename(fastq_file)
81 # Read fastq_file into a data frame. 64 # Read fastq_file into a data frame.
82 fastq_df = fastq_to_df(fastq_file, gzipped) 65 fastq_df = fastq_to_df(fastq_file, gzipped)
169 # VCFfilter 611 152 # VCFfilter 611
170 good_snp_count = items[1] 153 good_snp_count = items[1]
171 return ref_with_coverage, avg_depth_of_coverage, good_snp_count 154 return ref_with_coverage, avg_depth_of_coverage, good_snp_count
172 155
173 156
174 if __name__ == '__main__': 157 parser = argparse.ArgumentParser()
175 parser = argparse.ArgumentParser() 158
176 159 parser.add_argument('--dbkey', action='store', dest='dbkey', help='Reference dbkey')
177 parser.add_argument('--read1', action='store', dest='read1', required=False, default=None, help='Required: single read') 160 parser.add_argument('--gzipped', action='store_true', dest='gzipped', required=False, default=False, help='Input files are gzipped')
178 parser.add_argument('--read2', action='store', dest='read2', required=False, default=None, help='Optional: paired read') 161 parser.add_argument('--input_idxstats_dir', action='store', dest='input_idxstats_dir', required=False, default=None, help='Samtools idxstats input directory')
179 parser.add_argument('--dbkey', action='store', dest='dbkey', help='Reference dbkey') 162 parser.add_argument('--input_metrics_dir', action='store', dest='input_metrics_dir', required=False, default=None, help='vSNP add zero coverage metrics input directory')
180 parser.add_argument('--gzipped', action='store', dest='gzipped', help='Input files are gzipped') 163 parser.add_argument('--input_reads_dir', action='store', dest='input_reads_dir', required=False, default=None, help='Samples input directory')
181 parser.add_argument('--samtools_idxstats', action='store', dest='samtools_idxstats', required=False, default=None, help='Output of samtools_idxstats') 164 parser.add_argument('--list_paired', action='store_true', dest='list_paired', required=False, default=False, help='Input samples is a list of paired reads')
182 parser.add_argument('--output', action='store', dest='output', help='Output Excel statistics file') 165 parser.add_argument('--output', action='store', dest='output', help='Output Excel statistics file')
183 parser.add_argument('--vsnp_azc', action='store', dest='vsnp_azc', required=False, default=None, help='Output of vsnp_add_zero_coverage') 166 parser.add_argument('--read1', action='store', dest='read1', help='Required: single read')
184 167 parser.add_argument('--read2', action='store', dest='read2', required=False, default=None, help='Optional: paired read')
185 args = parser.parse_args() 168 parser.add_argument('--samtools_idxstats', action='store', dest='samtools_idxstats', help='Output of samtools_idxstats')
186 print("args:\n%s\n" % str(args)) 169 parser.add_argument('--vsnp_azc', action='store', dest='vsnp_azc', help='Output of vsnp_add_zero_coverage')
187 170
188 reads_files = [] 171 args = parser.parse_args()
189 idxstats_files = [] 172
190 metrics_files = [] 173 fastq_files = []
191 # Accumulate inputs. 174 idxstats_files = []
192 if args.read1 is not None: 175 metrics_files = []
193 # The inputs are not dataset collections, so 176 # Accumulate inputs.
194 # read1, read2 (possibly) and vsnp_azc will also 177 if args.read1 is not None:
195 # not be None. 178 # The inputs are not dataset collections, so
196 reads_files.append(args.read1) 179 # read1, read2 (possibly) and vsnp_azc will also
180 # not be None.
181 fastq_files.append(args.read1)
182 idxstats_files.append(args.samtools_idxstats)
183 metrics_files.append(args.vsnp_azc)
184 if args.read2 is not None:
185 fastq_files.append(args.read2)
197 idxstats_files.append(args.samtools_idxstats) 186 idxstats_files.append(args.samtools_idxstats)
198 metrics_files.append(args.vsnp_azc) 187 metrics_files.append(args.vsnp_azc)
199 if args.read2 is not None: 188 else:
200 reads_files.append(args.read2) 189 for file_name in sorted(os.listdir(args.input_reads_dir)):
201 idxstats_files.append(args.samtools_idxstats) 190 fastq_files.append(os.path.join(args.input_reads_dir, file_name))
202 metrics_files.append(args.vsnp_azc) 191 for file_name in sorted(os.listdir(args.input_idxstats_dir)):
203 else: 192 idxstats_files.append(os.path.join(args.input_idxstats_dir, file_name))
204 for file_name in sorted(os.listdir(INPUT_READS_DIR)): 193 if args.list_paired:
205 file_path = os.path.abspath(os.path.join(INPUT_READS_DIR, file_name)) 194 # Add the idxstats file for reverse.
206 reads_files.append(file_path) 195 idxstats_files.append(os.path.join(args.input_idxstats_dir, file_name))
207 base_file_name = get_base_file_name(file_path) 196 for file_name in sorted(os.listdir(args.input_metrics_dir)):
208 for file_name in sorted(os.listdir(INPUT_IDXSTATS_DIR)): 197 metrics_files.append(os.path.join(args.input_metrics_dir, file_name))
209 file_path = os.path.abspath(os.path.join(INPUT_IDXSTATS_DIR, file_name)) 198 if args.list_paired:
210 idxstats_files.append(file_path) 199 # Add the metrics file for reverse.
211 for file_name in sorted(os.listdir(INPUT_METRICS_DIR)): 200 metrics_files.append(os.path.join(args.input_metrics_dir, file_name))
212 file_path = os.path.abspath(os.path.join(INPUT_METRICS_DIR, file_name)) 201 output_statistics(fastq_files, idxstats_files, metrics_files, args.output, args.gzipped, args.dbkey)
213 metrics_files.append(file_path)
214 output_statistics(reads_files, idxstats_files, metrics_files, args.output, args.gzipped, args.dbkey)