comparison vsnp_statistics.py @ 4:2d6c6b01319e draft
Uploaded
| field | value |
|---|---|
| author | greg |
| date | Sun, 03 Jan 2021 15:47:28 +0000 |
| parents | 321a8259e3f9 |
| children | d0fbdeaaa488 |
| 3:321a8259e3f9 | 4:2d6c6b01319e |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 | 2 |
| 3 import argparse | 3 import argparse |
| 4 import gzip | 4 import gzip |
| | 5 import os |
| | 6 import shutil |
| | 7 |
| 5 import numpy | 8 import numpy |
| 6 import os | |
| 7 import pandas | 9 import pandas |
| 8 import shutil | 10 |
| 9 | |
| 10 INPUT_IDXSTATS_DIR = 'input_idxstats' | |
| 11 INPUT_METRICS_DIR = 'input_metrics' | |
| 12 INPUT_READS_DIR = 'input_reads' | |
| 13 QUALITYKEY = {'!': '0', '"': '1', '#': '2', '$': '3', '%': '4', '&': '5', "'": '6', '(': '7', | 11 QUALITYKEY = {'!': '0', '"': '1', '#': '2', '$': '3', '%': '4', '&': '5', "'": '6', '(': '7', |
| 14 ')': '8', '*': '9', '+': '10', ',': '11', '-': '12', '.': '13', '/': '14', '0': '15', | 12 ')': '8', '*': '9', '+': '10', ',': '11', '-': '12', '.': '13', '/': '14', '0': '15', |
| 15 '1': '16', '2': '17', '3': '18', '4': '19', '5': '20', '6': '21', '7': '22', | 13 '1': '16', '2': '17', '3': '18', '4': '19', '5': '20', '6': '21', '7': '22', |
| 16 '8': '23', '9': '24', ':': '25', ';': '26', '<': '27', '=': '28', '>': '29', | 14 '8': '23', '9': '24', ':': '25', ';': '26', '<': '27', '=': '28', '>': '29', |
| 17 '?': '30', '@': '31', 'A': '32', 'B': '33', 'C': '34', 'D': '35', 'E': '36', | 15 '?': '30', '@': '31', 'A': '32', 'B': '33', 'C': '34', 'D': '35', 'E': '36', |
| 24 's': '1', 't': '1', 'u': '1', 'v': '1', 'w': '1', 'x': '1', 'y': '1', 'z': '1', | 22 's': '1', 't': '1', 'u': '1', 'v': '1', 'w': '1', 'x': '1', 'y': '1', 'z': '1', |
| 25 ' ': '1'} | 23 ' ': '1'} |
| 26 | 24 |
| 27 | 25 |
| 28 def fastq_to_df(fastq_file, gzipped): | 26 def fastq_to_df(fastq_file, gzipped): |
| 29 if gzipped.lower() == "true": | 27 if gzipped: |
| 30 return pandas.read_csv(gzip.open(fastq_file, "r"), header=None, sep="^") | 28 return pandas.read_csv(gzip.open(fastq_file, "r"), header=None, sep="^") |
| 31 else: | 29 return pandas.read_csv(open(fastq_file, "r"), header=None, sep="^") |
| 32 return pandas.read_csv(open(fastq_file, "r"), header=None, sep="^") | |
| 33 | |
| 34 | |
| 35 def get_base_file_name(file_path): | |
| 36 base_file_name = os.path.basename(file_path) | |
| 37 if base_file_name.find(".") > 0: | |
| 38 # Eliminate the extension. | |
| 39 return os.path.splitext(base_file_name)[0] | |
| 40 elif base_file_name.find("_") > 0: | |
| 41 # The dot extension was likely changed to | |
| 42 # the "_" character. | |
| 43 items = base_file_name.split("_") | |
| 44 return "_".join(items[0:-1]) | |
| 45 else: | |
| 46 return base_file_name | |
| 47 | 30 |
| 48 | 31 |
| 49 def nice_size(size): | 32 def nice_size(size): |
| 50 # Returns a readably formatted string with the size | 33 # Returns a readably formatted string with the size |
| 51 words = ['bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB'] | 34 words = ['bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB'] |
| 65 return "%s%d bytes" % (prefix, size) | 48 return "%s%d bytes" % (prefix, size) |
| 66 return "%s%.1f %s" % (prefix, size, word) | 49 return "%s%.1f %s" % (prefix, size, word) |
| 67 return '??? bytes' | 50 return '??? bytes' |
| 68 | 51 |
| 69 | 52 |
| 70 def output_statistics(reads_files, idxstats_files, metrics_files, output_file, gzipped, dbkey): | 53 def output_statistics(fastq_files, idxstats_files, metrics_files, output_file, gzipped, dbkey): |
| 71 # Produce an Excel spreadsheet that | 54 # Produce an Excel spreadsheet that |
| 72 # contains a row for each sample. | 55 # contains a row for each sample. |
| 73 columns = ['Reference', 'File Size', 'Mean Read Length', 'Mean Read Quality', 'Reads Passing Q30', | 56 columns = ['Reference', 'File Size', 'Mean Read Length', 'Mean Read Quality', 'Reads Passing Q30', |
| 74 'Total Reads', 'All Mapped Reads', 'Unmapped Reads', 'Unmapped Reads Percentage of Total', | 57 'Total Reads', 'All Mapped Reads', 'Unmapped Reads', 'Unmapped Reads Percentage of Total', |
| 75 'Reference with Coverage', 'Average Depth of Coverage', 'Good SNP Count'] | 58 'Reference with Coverage', 'Average Depth of Coverage', 'Good SNP Count'] |
| 76 data_frames = [] | 59 data_frames = [] |
| 77 for i, fastq_file in enumerate(reads_files): | 60 for i, fastq_file in enumerate(fastq_files): |
| 78 idxstats_file = idxstats_files[i] | 61 idxstats_file = idxstats_files[i] |
| 79 metrics_file = metrics_files[i] | 62 metrics_file = metrics_files[i] |
| 80 file_name_base = os.path.basename(fastq_file) | 63 file_name_base = os.path.basename(fastq_file) |
| 81 # Read fastq_file into a data frame. | 64 # Read fastq_file into a data frame. |
| 82 fastq_df = fastq_to_df(fastq_file, gzipped) | 65 fastq_df = fastq_to_df(fastq_file, gzipped) |
| 169 # VCFfilter 611 | 152 # VCFfilter 611 |
| 170 good_snp_count = items[1] | 153 good_snp_count = items[1] |
| 171 return ref_with_coverage, avg_depth_of_coverage, good_snp_count | 154 return ref_with_coverage, avg_depth_of_coverage, good_snp_count |
| 172 | 155 |
| 173 | 156 |
| 174 if __name__ == '__main__': | 157 parser = argparse.ArgumentParser() |
| 175 parser = argparse.ArgumentParser() | 158 |
| 176 | 159 parser.add_argument('--dbkey', action='store', dest='dbkey', help='Reference dbkey') |
| 177 parser.add_argument('--read1', action='store', dest='read1', required=False, default=None, help='Required: single read') | 160 parser.add_argument('--gzipped', action='store_true', dest='gzipped', required=False, default=False, help='Input files are gzipped') |
| 178 parser.add_argument('--read2', action='store', dest='read2', required=False, default=None, help='Optional: paired read') | 161 parser.add_argument('--input_idxstats_dir', action='store', dest='input_idxstats_dir', required=False, default=None, help='Samtools idxstats input directory') |
| 179 parser.add_argument('--dbkey', action='store', dest='dbkey', help='Reference dbkey') | 162 parser.add_argument('--input_metrics_dir', action='store', dest='input_metrics_dir', required=False, default=None, help='vSNP add zero coverage metrics input directory') |
| 180 parser.add_argument('--gzipped', action='store', dest='gzipped', help='Input files are gzipped') | 163 parser.add_argument('--input_reads_dir', action='store', dest='input_reads_dir', required=False, default=None, help='Samples input directory') |
| 181 parser.add_argument('--samtools_idxstats', action='store', dest='samtools_idxstats', required=False, default=None, help='Output of samtools_idxstats') | 164 parser.add_argument('--list_paired', action='store_true', dest='list_paired', required=False, default=False, help='Input samples is a list of paired reads') |
| 182 parser.add_argument('--output', action='store', dest='output', help='Output Excel statistics file') | 165 parser.add_argument('--output', action='store', dest='output', help='Output Excel statistics file') |
| 183 parser.add_argument('--vsnp_azc', action='store', dest='vsnp_azc', required=False, default=None, help='Output of vsnp_add_zero_coverage') | 166 parser.add_argument('--read1', action='store', dest='read1', help='Required: single read') |
| 184 | 167 parser.add_argument('--read2', action='store', dest='read2', required=False, default=None, help='Optional: paired read') |
| 185 args = parser.parse_args() | 168 parser.add_argument('--samtools_idxstats', action='store', dest='samtools_idxstats', help='Output of samtools_idxstats') |
| 186 print("args:\n%s\n" % str(args)) | 169 parser.add_argument('--vsnp_azc', action='store', dest='vsnp_azc', help='Output of vsnp_add_zero_coverage') |
| 187 | 170 |
| 188 reads_files = [] | 171 args = parser.parse_args() |
| 189 idxstats_files = [] | 172 |
| 190 metrics_files = [] | 173 fastq_files = [] |
| 191 # Accumulate inputs. | 174 idxstats_files = [] |
| 192 if args.read1 is not None: | 175 metrics_files = [] |
| 193 # The inputs are not dataset collections, so | 176 # Accumulate inputs. |
| 194 # read1, read2 (possibly) and vsnp_azc will also | 177 if args.read1 is not None: |
| 195 # not be None. | 178 # The inputs are not dataset collections, so |
| 196 reads_files.append(args.read1) | 179 # read1, read2 (possibly) and vsnp_azc will also |
| | 180 # not be None. |
| | 181 fastq_files.append(args.read1) |
| | 182 idxstats_files.append(args.samtools_idxstats) |
| | 183 metrics_files.append(args.vsnp_azc) |
| | 184 if args.read2 is not None: |
| | 185 fastq_files.append(args.read2) |
| 197 idxstats_files.append(args.samtools_idxstats) | 186 idxstats_files.append(args.samtools_idxstats) |
| 198 metrics_files.append(args.vsnp_azc) | 187 metrics_files.append(args.vsnp_azc) |
| 199 if args.read2 is not None: | 188 else: |
| 200 reads_files.append(args.read2) | 189 for file_name in sorted(os.listdir(args.input_reads_dir)): |
| 201 idxstats_files.append(args.samtools_idxstats) | 190 fastq_files.append(os.path.join(args.input_reads_dir, file_name)) |
| 202 metrics_files.append(args.vsnp_azc) | 191 for file_name in sorted(os.listdir(args.input_idxstats_dir)): |
| 203 else: | 192 idxstats_files.append(os.path.join(args.input_idxstats_dir, file_name)) |
| 204 for file_name in sorted(os.listdir(INPUT_READS_DIR)): | 193 if args.list_paired: |
| 205 file_path = os.path.abspath(os.path.join(INPUT_READS_DIR, file_name)) | 194 # Add the idxstats file for reverse. |
| 206 reads_files.append(file_path) | 195 idxstats_files.append(os.path.join(args.input_idxstats_dir, file_name)) |
| 207 base_file_name = get_base_file_name(file_path) | 196 for file_name in sorted(os.listdir(args.input_metrics_dir)): |
| 208 for file_name in sorted(os.listdir(INPUT_IDXSTATS_DIR)): | 197 metrics_files.append(os.path.join(args.input_metrics_dir, file_name)) |
| 209 file_path = os.path.abspath(os.path.join(INPUT_IDXSTATS_DIR, file_name)) | 198 if args.list_paired: |
| 210 idxstats_files.append(file_path) | 199 # Add the metrics file for reverse. |
| 211 for file_name in sorted(os.listdir(INPUT_METRICS_DIR)): | 200 metrics_files.append(os.path.join(args.input_metrics_dir, file_name)) |
| 212 file_path = os.path.abspath(os.path.join(INPUT_METRICS_DIR, file_name)) | 201 output_statistics(fastq_files, idxstats_files, metrics_files, args.output, args.gzipped, args.dbkey) |
| 213 metrics_files.append(file_path) | |
| 214 output_statistics(reads_files, idxstats_files, metrics_files, args.output, args.gzipped, args.dbkey) |
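
The new revision changes `--gzipped` from a string-valued option (previously tested with `gzipped.lower() == "true"`) to a `store_true` flag, so `fastq_to_df` now receives a real boolean. A minimal sketch of that revised behavior is below; the `example.fastq.gz` path is hypothetical and is only there to show how the flag would be passed through.

```python
import argparse
import gzip

import pandas


def fastq_to_df(fastq_file, gzipped):
    # gzipped is now a real boolean, so no string comparison is needed.
    if gzipped:
        return pandas.read_csv(gzip.open(fastq_file, "r"), header=None, sep="^")
    return pandas.read_csv(open(fastq_file, "r"), header=None, sep="^")


parser = argparse.ArgumentParser()
parser.add_argument('--gzipped', action='store_true', dest='gzipped', required=False, default=False, help='Input files are gzipped')

# Supplying --gzipped sets args.gzipped to True; omitting it leaves it False.
args = parser.parse_args(['--gzipped'])
print(args.gzipped)  # True

# Hypothetical call for illustration only; 'example.fastq.gz' is not part of the repository.
# fastq_df = fastq_to_df('example.fastq.gz', args.gzipped)
```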

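The other substantive change is the collection mode: when `--read1` is not supplied, the revised script lists every file in `--input_reads_dir`, `--input_idxstats_dir`, and `--input_metrics_dir`, and `--list_paired` makes it append each idxstats and metrics path a second time so those lists stay index-aligned with the forward and reverse FASTQ files. The following standalone sketch reproduces that alignment; the directory names and sample file names are made up for illustration.

```python
import os

# Hypothetical directory listings, standing in for os.listdir() on the three input directories.
reads_listing = ['sampleA_R1.fastq.gz', 'sampleA_R2.fastq.gz',
                 'sampleB_R1.fastq.gz', 'sampleB_R2.fastq.gz']
idxstats_listing = ['sampleA.idxstats', 'sampleB.idxstats']
metrics_listing = ['sampleA.metrics', 'sampleB.metrics']
list_paired = True  # mirrors args.list_paired

fastq_files = [os.path.join('input_reads', f) for f in sorted(reads_listing)]

idxstats_files = []
for file_name in sorted(idxstats_listing):
    idxstats_files.append(os.path.join('input_idxstats', file_name))
    if list_paired:
        # Add the idxstats file a second time for the reverse read.
        idxstats_files.append(os.path.join('input_idxstats', file_name))

metrics_files = []
for file_name in sorted(metrics_listing):
    metrics_files.append(os.path.join('input_metrics', file_name))
    if list_paired:
        # Add the metrics file a second time for the reverse read.
        metrics_files.append(os.path.join('input_metrics', file_name))

# Each FASTQ file now lines up with exactly one idxstats and one metrics entry.
assert len(fastq_files) == len(idxstats_files) == len(metrics_files)
```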