# HG changeset patch
# User iuc
# Date 1637093576 0
# Node ID a8560decb495aa61b35f530fc93ea32c22591ad0
# Parent e3016c6c5994a7a03b653573b42b1bf6052626f3
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 2a94c64d6c7236550bf483d2ffc4e86248c63aab"

diff -r e3016c6c5994 -r a8560decb495 macros.xml
--- a/macros.xml	Tue Nov 16 08:28:27 2021 +0000
+++ b/macros.xml	Tue Nov 16 20:12:56 2021 +0000
@@ -1,7 +1,35 @@
- 1.0
- 19.09
+ 1.0
+ 2
+ 21.01
+
+ biopython
+
+
+ numpy
+
+
+ openpyxl
+
+
+ pandas
+
+
+ pysam
+
+
+ pyvcf
+
+
+ pyyaml
+
+
+ xlrd
+
+
+ xlsxwriter
+
diff -r e3016c6c5994 -r a8560decb495 test-data/Mbovis-01D6_cascade_table.xlsx
Binary file test-data/Mbovis-01D6_cascade_table.xlsx has changed
diff -r e3016c6c5994 -r a8560decb495 test-data/Mbovis-01D6_sort_table.xlsx
Binary file test-data/Mbovis-01D6_sort_table.xlsx has changed
diff -r e3016c6c5994 -r a8560decb495 test-data/Mbovis-01D_cascade_table.xlsx
Binary file test-data/Mbovis-01D_cascade_table.xlsx has changed
diff -r e3016c6c5994 -r a8560decb495 test-data/Mbovis-01D_sort_table.xlsx
Binary file test-data/Mbovis-01D_sort_table.xlsx has changed
diff -r e3016c6c5994 -r a8560decb495 test-data/Mbovis-01_cascade_table.xlsx
Binary file test-data/Mbovis-01_cascade_table.xlsx has changed
diff -r e3016c6c5994 -r a8560decb495 test-data/Mbovis-01_sort_table.xlsx
Binary file test-data/Mbovis-01_sort_table.xlsx has changed
diff -r e3016c6c5994 -r a8560decb495 test-data/cascade_table.xlsx
Binary file test-data/cascade_table.xlsx has changed
diff -r e3016c6c5994 -r a8560decb495 test-data/output_metrics.tabular
--- a/test-data/output_metrics.tabular	Tue Nov 16 08:28:27 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,3 +0,0 @@
-# File	Number of Good SNPs	Average Coverage	Genome Coverage
-vcf_input_vcf	0.659602	50.49%
-vcf_input_vcf	0
diff -r e3016c6c5994 -r a8560decb495 test-data/output_vcf.vcf
--- a/test-data/output_vcf.vcf	Tue Nov 16 08:28:27 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,100 +0,0 @@
-##fileformat=VCFv4.2
-##fileDate=20200302
-##source=freeBayes v1.3.1-dirty
-##reference=/home/galaxy/galaxy/tool-data/AF2122/seq/AF2122.fa
-##contig=
-##phasing=none
-##commandline="freebayes --region NC_002945.4:0..4349904 --bam b_0.bam --fasta-reference /home/galaxy/galaxy/tool-data/AF2122/seq/AF2122.fa --vcf ./vcf_output/part_NC_002945.4:0..4349904.vcf -u -n 0 --haplotype-length -1 --min-repeat-size 5 --min-repeat-entropy 1 -m 1 -q 0 -R 0 -Y 0 -e 1 -F 0.05 -C 2 -G 1 --min-alternate-qsum 0"
-##filter="QUAL > 0"
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##INFO=
-##FORMAT=
-##FORMAT=
-##FORMAT=
-##FORMAT=
-##FORMAT=
-##FORMAT=
-##FORMAT=
-##FORMAT=
-##FORMAT=
-##FORMAT=
-#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	13-1941-6
-NC_002945.4	1	.	N	.	.	.	.	GT	./.
-NC_002945.4	2	.	N	.	.	.	.	GT	./.
-NC_002945.4	3	.	N	.	.	.	.	GT	./.
-NC_002945.4	4	.	N	.	.	.	.	GT	./.
-NC_002945.4	5	.	N	.	.	.	.	GT	./.
-NC_002945.4	6	.	N	.	.	.	.	GT	./.
-NC_002945.4	7	.	N	.	.	.	.	GT	./.
-NC_002945.4	8	.	N	.	.	.	.	GT	./.
-NC_002945.4	9	.	N	.	.	.	.	GT	./.
-NC_002945.4	10	.	N	.	.	.	.	GT	./.
-NC_002945.4	11	.	N	.	.	.	.	GT	./.
-NC_002945.4	12	.	N	.	.	.	.	GT	./.
-NC_002945.4	13	.	N	.	.	.	.	GT	./.
-NC_002945.4	14	.	N	.	.	.	.	GT	./.
-NC_002945.4	15	.	N	.	.	.	.	GT	./.
-NC_002945.4	16	.	N	.	.	.	.	GT	./.
-NC_002945.4	17	.	N	.	.	.	.	GT	./.
-NC_002945.4	18	.	N	.	.	.	.	GT	./.
-NC_002945.4	19	.	N	.	.	.	.	GT	./.
-NC_002945.4	20	.	N	.	.	.	.	GT	./.
-NC_002945.4	21	.	N	.	.	.	.	GT	./.
-NC_002945.4	22	.	N	.	.	.	.	GT	./.
-NC_002945.4	23	.	N	.	.	.	.	GT	./.
-NC_002945.4	24	.	N	.	.	.	.	GT	./.
-NC_002945.4	25	.	N	.	.	.	.	GT	./.
-NC_002945.4	26	.	N	.	.	.	.	GT	./.
-NC_002945.4	27	.	N	.	.	.	.	GT	./.
-NC_002945.4	28	.	N	.	.	.	.	GT	./.
-NC_002945.4	29	.	N	.	.	.	.	GT	./.
-NC_002945.4	30	.	N	.	.	.	.	GT	./.
-NC_002945.4	31	.	N	.	.	.	.	GT	./.
-NC_002945.4	32	.	N	.	.	.	.	GT	./.
-NC_002945.4	33	.	N	.	.	.	.	GT	./.
-NC_002945.4	34	.	N	.	.	.	.	GT	./.
-NC_002945.4	35	.	N	.	.	.	.	GT	./.
-NC_002945.4	36	.	N	.	.	.	.	GT	./.
-NC_002945.4	37	.	N	.	.	.	.	GT	./.
diff -r e3016c6c5994 -r a8560decb495 test-data/sort_table.xlsx
Binary file test-data/sort_table.xlsx has changed
diff -r e3016c6c5994 -r a8560decb495 test-data/vsnp_statistics1.tabular
--- a/test-data/vsnp_statistics1.tabular	Tue Nov 16 08:28:27 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
- Reference File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count
-Mcap_Deer_DE_SRR650221_fastq_gz 89 1.6 MB 121.0 29.7 0.53 4317 17063 223 0.05 8.27% 0.439436 36
diff -r e3016c6c5994 -r a8560decb495 test-data/vsnp_statistics2.tabular
--- a/test-data/vsnp_statistics2.tabular	Tue Nov 16 08:28:27 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,3 +0,0 @@
- Reference File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count
-13-1941-6_S4_L001_R1_600000_fastq_gz 89 8.7 KB 100.0 65.7 1.00 25 45 5 0.20 98.74% 10.338671 611
-13-1941-6_S4_L001_R2_600000_fastq_gz 89 8.5 KB 100.0 66.3 1.00 25 45 5 0.20 98.74% 10.338671 611
diff -r e3016c6c5994 -r a8560decb495 test-data/vsnp_statistics3.tabular
--- a/test-data/vsnp_statistics3.tabular	Tue Nov 16 08:28:27 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,3 +0,0 @@
- Reference File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count
-13-1941-6_S4_L001_R1_600000_fastq_gz 89 8.7 KB 100.0 65.7 1.00 25 24 2 0.08 0.13% 0.001252 0
-Mcap_Deer_DE_SRR650221_fastq_gz 89 1.6 MB 121.0 29.7 0.53 4317 17063 223 0.05 8.27% 0.439436 36
diff -r e3016c6c5994 -r a8560decb495 test-data/vsnp_statistics4.tabular
--- a/test-data/vsnp_statistics4.tabular	Tue Nov 16 08:28:27 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,3 +0,0 @@
- Reference File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count
-13-1941-6_S4_L001_R1_600000_fastq_gz 89 8.7 KB 100.0 65.7 1.00 25 46 4 0.16 0.16% 0.002146 0
-13-1941-6_S4_L001_R2_600000_fastq_gz 89 8.5 KB 100.0 66.3 1.00 25 46 4 0.16 0.16% 0.002146 0
diff -r e3016c6c5994 -r a8560decb495 vsnp_build_tables.py
--- a/vsnp_build_tables.py	Tue Nov 16 08:28:27 2021 +0000
+++ b/vsnp_build_tables.py	Tue Nov 16 20:12:56 2021 +0000
@@ -1,7 +1,9 @@
 #!/usr/bin/env python
 
 import argparse
+import multiprocessing
 import os
+import queue
 import re
 
 import pandas
@@ -16,6 +18,9 @@
 # to use LibreOffice for Excel spreadsheets.
 MAXCOLS = 1024
 OUTPUT_EXCEL_DIR = 'output_excel_dir'
+INPUT_JSON_AVG_MQ_DIR = 'input_json_avg_mq_dir'
+INPUT_JSON_DIR = 'input_json_dir'
+INPUT_NEWICK_DIR = 'input_newick_dir'
 
 
 def annotate_table(table_df, group, annotation_dict):
@@ -221,74 +226,94 @@
     output_excel(df, type_str, group_str, annotation_dict)
 
 
-def preprocess_tables(newick_file, json_file, json_avg_mq_file, annotation_dict):
-    avg_mq_series = pandas.read_json(json_avg_mq_file, typ='series', orient='split')
-    # Map quality to dataframe.
-    mqdf = avg_mq_series.to_frame(name='MQ')
-    mqdf = mqdf.T
-    # Get the group.
-    group = get_sample_name(newick_file)
-    snps_df = pandas.read_json(json_file, orient='split')
-    with open(newick_file, 'r') as fh:
-        for line in fh:
-            line = re.sub('[:,]', '\n', line)
-            line = re.sub('[)(]', '', line)
-            line = re.sub(r'[0-9].*\.[0-9].*\n', '', line)
-            line = re.sub('root\n', '', line)
-            sample_order = line.split('\n')
-            sample_order = list([_f for _f in sample_order if _f])
-            sample_order.insert(0, 'root')
-            tree_order = snps_df.loc[sample_order]
-    # Count number of SNPs in each column.
-    snp_per_column = []
-    for column_header in tree_order:
-        count = 0
-        column = tree_order[column_header]
-        for element in column:
-            if element != column[0]:
-                count = count + 1
-        snp_per_column.append(count)
-    row1 = pandas.Series(snp_per_column, tree_order.columns, name="snp_per_column")
-    # Count number of SNPS from the
-    # top of each column in the table.
-    snp_from_top = []
-    for column_header in tree_order:
-        count = 0
-        column = tree_order[column_header]
-        # for each element in the column
-        # skip the first element
-        for element in column[1:]:
-            if element == column[0]:
-                count = count + 1
-            else:
-                break
-        snp_from_top.append(count)
-    row2 = pandas.Series(snp_from_top, tree_order.columns, name="snp_from_top")
-    tree_order = tree_order.append([row1])
-    tree_order = tree_order.append([row2])
-    # In pandas=0.18.1 even this does not work:
-    # abc = row1.to_frame()
-    # abc = abc.T --> tree_order.shape (5, 18), abc.shape (1, 18)
-    # tree_order.append(abc)
-    # Continue to get error: "*** ValueError: all the input arrays must have same number of dimensions"
-    tree_order = tree_order.T
-    tree_order = tree_order.sort_values(['snp_from_top', 'snp_per_column'], ascending=[True, False])
-    tree_order = tree_order.T
-    # Remove snp_per_column and snp_from_top rows.
-    cascade_order = tree_order[:-2]
-    # Output the cascade table.
-    output_cascade_table(cascade_order, mqdf, group, annotation_dict)
-    # Output the sorted table.
-    output_sort_table(cascade_order, mqdf, group, annotation_dict)
+def preprocess_tables(task_queue, annotation_dict, timeout):
+    while True:
+        try:
+            tup = task_queue.get(block=True, timeout=timeout)
+        except queue.Empty:
+            break
+        newick_file, json_file, json_avg_mq_file = tup
+        avg_mq_series = pandas.read_json(json_avg_mq_file, typ='series', orient='split')
+        # Map quality to dataframe.
+        mqdf = avg_mq_series.to_frame(name='MQ')
+        mqdf = mqdf.T
+        # Get the group.
+        group = get_sample_name(newick_file)
+        snps_df = pandas.read_json(json_file, orient='split')
+        with open(newick_file, 'r') as fh:
+            for line in fh:
+                line = re.sub('[:,]', '\n', line)
+                line = re.sub('[)(]', '', line)
+                line = re.sub(r'[0-9].*\.[0-9].*\n', '', line)
+                line = re.sub('root\n', '', line)
+                sample_order = line.split('\n')
+                sample_order = list([_f for _f in sample_order if _f])
+                sample_order.insert(0, 'root')
+                tree_order = snps_df.loc[sample_order]
+        # Count number of SNPs in each column.
+        snp_per_column = []
+        for column_header in tree_order:
+            count = 0
+            column = tree_order[column_header]
+            for element in column:
+                if element != column[0]:
+                    count = count + 1
+            snp_per_column.append(count)
+        row1 = pandas.Series(snp_per_column, tree_order.columns, name="snp_per_column")
+        # Count number of SNPs from the
+        # top of each column in the table.
+        snp_from_top = []
+        for column_header in tree_order:
+            count = 0
+            column = tree_order[column_header]
+            # For each element in the column,
+            # skip the first element.
+            for element in column[1:]:
+                if element == column[0]:
+                    count = count + 1
+                else:
+                    break
+            snp_from_top.append(count)
+        row2 = pandas.Series(snp_from_top, tree_order.columns, name="snp_from_top")
+        tree_order = tree_order.append([row1])
+        tree_order = tree_order.append([row2])
+        # In pandas=0.18.1 even this does not work:
+        # abc = row1.to_frame()
+        # abc = abc.T --> tree_order.shape (5, 18), abc.shape (1, 18)
+        # tree_order.append(abc)
+        # Continue to get error: "*** ValueError: all the input arrays must have same number of dimensions"
+        tree_order = tree_order.T
+        tree_order = tree_order.sort_values(['snp_from_top', 'snp_per_column'], ascending=[True, False])
+        tree_order = tree_order.T
+        # Remove snp_per_column and snp_from_top rows.
+        cascade_order = tree_order[:-2]
+        # Output the cascade table.
+        output_cascade_table(cascade_order, mqdf, group, annotation_dict)
+        # Output the sorted table.
+        output_sort_table(cascade_order, mqdf, group, annotation_dict)
+        task_queue.task_done()
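The regular-expression dance at the top of preprocess_tables() is the only Newick handling in the script: it reduces the tree text to a newline-separated list of sample names in the order they appear in the tree. A minimal, standalone sketch of just that step, using the hypothetical sample names A1 and B2 (not from the tool's test data):

    import re

    def sample_order_from_newick(newick_text):
        # Turn separators into newlines, then strip parentheses, branch
        # lengths and the root label, as preprocess_tables() does.
        line = re.sub('[:,]', '\n', newick_text)
        line = re.sub('[)(]', '', line)
        line = re.sub(r'[0-9].*\.[0-9].*\n', '', line)
        line = re.sub('root\n', '', line)
        samples = [s for s in line.split('\n') if s]
        samples.insert(0, 'root')
        return samples

    print(sample_order_from_newick('(root:0.0,(A1:0.1,B2:0.2):0.05);\n'))
    # ['root', 'A1', 'B2']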
+
+
+def set_num_cpus(num_files, processes):
+    num_cpus = int(multiprocessing.cpu_count())
+    if num_files < num_cpus and num_files < processes:
+        return num_files
+    if num_cpus < processes:
+        half_cpus = int(num_cpus / 2)
+        if num_files < half_cpus:
+            return num_files
+        return half_cpus
+    return processes
 
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
 
+    parser.add_argument('--input_avg_mq_json', action='store', dest='input_avg_mq_json', required=False, default=None, help='Average MQ json file')
+    parser.add_argument('--input_newick', action='store', dest='input_newick', required=False, default=None, help='Newick file')
+    parser.add_argument('--input_snps_json', action='store', dest='input_snps_json', required=False, default=None, help='SNPs json file')
     parser.add_argument('--gbk_file', action='store', dest='gbk_file', required=False, default=None, help='Optional gbk file'),
-    parser.add_argument('--input_avg_mq_json', action='store', dest='input_avg_mq_json', help='Average MQ json file')
-    parser.add_argument('--input_newick', action='store', dest='input_newick', help='Newick file')
-    parser.add_argument('--input_snps_json', action='store', dest='input_snps_json', help='SNPs json file')
+    parser.add_argument('--processes', action='store', dest='processes', type=int, help='User-selected number of processes to use for job splitting')
 
     args = parser.parse_args()
@@ -299,4 +324,56 @@
     else:
         annotation_dict = None
 
-    preprocess_tables(args.input_newick, args.input_snps_json, args.input_avg_mq_json, annotation_dict)
+    # The files in INPUT_NEWICK_DIR and INPUT_JSON_DIR are assumed to
+    # be named so that they match up when the directories contain more
+    # than 1 file (i.e., the newick file names and json file names
+    # should share a prefix like Mbovis-01D6_* so that sorting each
+    # directory listing associates each newick file with its json files).
+    if args.input_newick is not None:
+        newick_files = [args.input_newick]
+    else:
+        newick_files = []
+        for file_name in sorted(os.listdir(INPUT_NEWICK_DIR)):
+            file_path = os.path.abspath(os.path.join(INPUT_NEWICK_DIR, file_name))
+            newick_files.append(file_path)
+    if args.input_snps_json is not None:
+        json_files = [args.input_snps_json]
+    else:
+        json_files = []
+        for file_name in sorted(os.listdir(INPUT_JSON_DIR)):
+            file_path = os.path.abspath(os.path.join(INPUT_JSON_DIR, file_name))
+            json_files.append(file_path)
+    if args.input_avg_mq_json is not None:
+        json_avg_mq_files = [args.input_avg_mq_json]
+    else:
+        json_avg_mq_files = []
+        for file_name in sorted(os.listdir(INPUT_JSON_AVG_MQ_DIR)):
+            file_path = os.path.abspath(os.path.join(INPUT_JSON_AVG_MQ_DIR, file_name))
+            json_avg_mq_files.append(file_path)
+
+    multiprocessing.set_start_method('spawn')
+    queue1 = multiprocessing.JoinableQueue()
+    queue2 = multiprocessing.JoinableQueue()
+    num_files = len(newick_files)
+    cpus = set_num_cpus(num_files, args.processes)
+    # Set a timeout for get()s in the queue.
+    timeout = 0.05
+
+    for i, newick_file in enumerate(newick_files):
+        json_file = json_files[i]
+        json_avg_mq_file = json_avg_mq_files[i]
+        queue1.put((newick_file, json_file, json_avg_mq_file))
+
+    # Start one preprocess_tables worker per allotted CPU to drain the queue.
+    processes = [multiprocessing.Process(target=preprocess_tables, args=(queue1, annotation_dict, timeout, )) for _ in range(cpus)]
+    for p in processes:
+        p.start()
+    for p in processes:
+        p.join()
+    queue1.join()
+
+    if queue1.empty():
+        queue1.close()
+        queue1.join_thread()
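The block above is the heart of the refactor: all work items are enqueued in the parent, and preprocess_tables() now runs as a pool of worker processes that drain the queue until a get() times out. A minimal, self-contained sketch of the same pattern with illustrative names (not the tool's code):

    import multiprocessing
    import queue

    def worker(task_queue, timeout):
        while True:
            try:
                item = task_queue.get(block=True, timeout=timeout)
            except queue.Empty:
                break
            print('processing', item)  # the real worker builds the tables here
            task_queue.task_done()

    if __name__ == '__main__':
        multiprocessing.set_start_method('spawn')
        q = multiprocessing.JoinableQueue()
        for i in range(8):
            q.put(i)
        workers = [multiprocessing.Process(target=worker, args=(q, 0.05)) for _ in range(2)]
        for w in workers:
            w.start()
        for w in workers:
            w.join()
        q.join()

Because every item is on the queue before the workers start, the queue.Empty timeout is a safe termination signal, just as in the script.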
diff -r e3016c6c5994 -r a8560decb495 vsnp_determine_ref_from_data.xml
--- a/vsnp_determine_ref_from_data.xml	Tue Nov 16 08:28:27 2021 +0000
+++ b/vsnp_determine_ref_from_data.xml	Tue Nov 16 20:12:56 2021 +0000
@@ -1,11 +1,11 @@
-
+
 from input data
 macros.xml
- biopython
- pyyaml
+
+
diff -r e3016c6c5994 -r a8560decb495 vsnp_statistics.py
--- a/vsnp_statistics.py	Tue Nov 16 08:28:27 2021 +0000
+++ b/vsnp_statistics.py	Tue Nov 16 20:12:56 2021 +0000
-    if sampling_size > total_reads:
-        sampling_size = total_reads
+def get_statistics(dbkey, fastq_file, gzipped):
+    sampling_size = 10000
+    # Read fastq_file into a data frame to
+    # get the phred quality scores.
+    _open = partial(gzip.open, mode='rt') if gzipped else open
+    with _open(fastq_file) as fh:
+        identifiers = []
+        seqs = []
+        letter_annotations = []
+        for seq_record in SeqIO.parse(fh, "fastq"):
+            identifiers.append(seq_record.id)
+            seqs.append(seq_record.seq)
+            letter_annotations.append(seq_record.letter_annotations["phred_quality"])
+    # Convert lists to Pandas series.
+    s1 = pandas.Series(identifiers, name='id')
+    s2 = pandas.Series(seqs, name='seq')
+    # Gather Series into a data frame.
+    fastq_df = pandas.DataFrame(dict(id=s1, seq=s2)).set_index(['id'])
+    # Starting at row 3, keep every 4th row and randomly
+    # sample the specified number of rows.
+    file_size = nice_size(os.path.getsize(fastq_file))
+    total_reads = len(seqs)
+    # Mean Read Length
+    if sampling_size > total_reads:
+        sampling_size = total_reads
+    try:
         fastq_df = fastq_df.iloc[3::4].sample(sampling_size)
-    dict_mean = {}
-    list_length = []
-    i = 0
-    for id, seq, in fastq_df.iterrows():
-        dict_mean[id] = numpy.mean(letter_annotations[i])
-        list_length.append(len(seq.array[0]))
-        i += 1
-    current_sample_df.at[file_name_base, 'Mean Read Length'] = '%.1f' % numpy.mean(list_length)
-    # Mean Read Quality
-    df_mean = pandas.DataFrame.from_dict(dict_mean, orient='index', columns=['ave'])
-    current_sample_df.at[file_name_base, 'Mean Read Quality'] = '%.1f' % df_mean['ave'].mean()
-    # Reads Passing Q30
-    reads_gt_q30 = len(df_mean[df_mean['ave'] >= 30])
-    reads_passing_q30 = '{:10.2f}'.format(reads_gt_q30 / sampling_size)
-    current_sample_df.at[file_name_base, 'Reads Passing Q30'] = reads_passing_q30
+    except ValueError:
+        fastq_df = fastq_df.iloc[3::4].sample(sampling_size, replace=True)
+    dict_mean = {}
+    list_length = []
+    i = 0
+    for id, seq, in fastq_df.iterrows():
+        dict_mean[id] = numpy.mean(letter_annotations[i])
+        list_length.append(len(seq.array[0]))
+        i += 1
+    mean_read_length = '%.1f' % numpy.mean(list_length)
+    # Mean Read Quality
+    df_mean = pandas.DataFrame.from_dict(dict_mean, orient='index', columns=['ave'])
+    mean_read_quality = '%.1f' % df_mean['ave'].mean()
+    # Reads Passing Q30
+    reads_gt_q30 = len(df_mean[df_mean['ave'] >= 30])
+    reads_passing_q30 = '{:10.2f}'.format(reads_gt_q30 / sampling_size)
+    stats = Statistics(dbkey, os.path.basename(fastq_file), file_size, total_reads, mean_read_length,
+                       mean_read_quality, reads_passing_q30)
+    return stats
+
+
+def accrue_statistics(dbkey, read1, read2, gzipped):
+    read1_stats = get_statistics(dbkey, read1, gzipped)
+    if read2 is None:
+        read2_stats = None
+    else:
+        read2_stats = get_statistics(dbkey, read2, gzipped)
+    return read1_stats, read2_stats
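get_statistics() boils each FASTQ down to a handful of numbers: read count, mean read length, mean per-read quality, and the fraction of reads whose mean quality reaches Q30. A compact sketch of those computations, assuming an uncompressed FASTQ at the hypothetical path reads.fastq and skipping the script's row-sampling step:

    import numpy
    from Bio import SeqIO

    lengths = []
    mean_quals = []
    with open('reads.fastq') as fh:
        for rec in SeqIO.parse(fh, 'fastq'):
            lengths.append(len(rec.seq))
            mean_quals.append(numpy.mean(rec.letter_annotations['phred_quality']))

    total_reads = len(lengths)
    mean_read_length = '%.1f' % numpy.mean(lengths)
    mean_read_quality = '%.1f' % numpy.mean(mean_quals)
    # Fraction of reads whose mean phred quality is at least 30.
    reads_passing_q30 = '{:10.2f}'.format(sum(q >= 30 for q in mean_quals) / total_reads)
    print(total_reads, mean_read_length, mean_read_quality, reads_passing_q30)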
+
+
+def output_statistics(read1_stats, read2_stats, idxstats_file, metrics_file, output_file):
+    paired_reads = read2_stats is not None
+    if paired_reads:
+        columns = ['Read1 FASTQ', 'File Size', 'Reads', 'Mean Read Length', 'Mean Read Quality',
+                   'Reads Passing Q30', 'Read2 FASTQ', 'File Size', 'Reads', 'Mean Read Length', 'Mean Read Quality',
+                   'Reads Passing Q30', 'Total Reads', 'All Mapped Reads', 'Unmapped Reads',
+                   'Unmapped Reads Percentage of Total', 'Reference with Coverage', 'Average Depth of Coverage',
+                   'Good SNP Count', 'Reference']
+    else:
+        columns = ['FASTQ', 'File Size', 'Mean Read Length', 'Mean Read Quality', 'Reads Passing Q30',
+                   'Total Reads', 'All Mapped Reads', 'Unmapped Reads', 'Unmapped Reads Percentage of Total',
+                   'Reference with Coverage', 'Average Depth of Coverage', 'Good SNP Count', 'Reference']
+    with open(output_file, "w") as outfh:
+        # Make sure the header starts with a # so
+        # MultiQC can properly handle the output.
+        outfh.write("# %s\n" % "\t".join(columns))
+        line_items = []
+        # Gather the statistics and write a single output line.
+        line_items.append(read1_stats.fastq_file)
+        line_items.append(read1_stats.file_size)
+        if paired_reads:
+            line_items.append(read1_stats.total_reads)
+        line_items.append(read1_stats.mean_read_length)
+        line_items.append(read1_stats.mean_read_quality)
+        line_items.append(read1_stats.reads_passing_q30)
+        if paired_reads:
+            line_items.append(read2_stats.fastq_file)
+            line_items.append(read2_stats.file_size)
+            line_items.append(read2_stats.total_reads)
+            line_items.append(read2_stats.mean_read_length)
+            line_items.append(read2_stats.mean_read_quality)
+            line_items.append(read2_stats.reads_passing_q30)
         # Total Reads
-    current_sample_df.at[file_name_base, 'Total Reads'] = total_reads
+        if paired_reads:
+            total_reads = read1_stats.total_reads + read2_stats.total_reads
+        else:
+            total_reads = read1_stats.total_reads
+        line_items.append(total_reads)
         # All Mapped Reads
         all_mapped_reads, unmapped_reads = process_idxstats_file(idxstats_file)
-    current_sample_df.at[file_name_base, 'All Mapped Reads'] = all_mapped_reads
-    # Unmapped Reads
-    current_sample_df.at[file_name_base, 'Unmapped Reads'] = unmapped_reads
+        line_items.append(all_mapped_reads)
+        line_items.append(unmapped_reads)
         # Unmapped Reads Percentage of Total
         if unmapped_reads > 0:
             unmapped_reads_percentage = '{:10.2f}'.format(unmapped_reads / total_reads)
         else:
             unmapped_reads_percentage = 0
-    current_sample_df.at[file_name_base, 'Unmapped Reads Percentage of Total'] = unmapped_reads_percentage
+        line_items.append(unmapped_reads_percentage)
         # Reference with Coverage
         ref_with_coverage, avg_depth_of_coverage, good_snp_count = process_metrics_file(metrics_file)
-    current_sample_df.at[file_name_base, 'Reference with Coverage'] = ref_with_coverage
-    # Average Depth of Coverage
-    current_sample_df.at[file_name_base, 'Average Depth of Coverage'] = avg_depth_of_coverage
-    # Good SNP Count
-    current_sample_df.at[file_name_base, 'Good SNP Count'] = good_snp_count
-    data_frames.append(current_sample_df)
-    output_df = pandas.concat(data_frames)
-    output_df.to_csv(output_file, sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\')
+        line_items.append(ref_with_coverage)
+        line_items.append(avg_depth_of_coverage)
+        line_items.append(good_snp_count)
+        line_items.append(read1_stats.reference)
+        outfh.write('%s\n' % '\t'.join(str(x) for x in line_items))
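output_statistics() leans on process_idxstats_file(), whose body falls outside this hunk. samtools idxstats emits one tab-separated line per reference (name, length, mapped read count, unmapped read count), with a final `*` line carrying the unplaced reads, so a plausible reading of that helper looks like this sketch (an assumption, not the tool's actual code):

    def process_idxstats_file(idxstats_file):
        all_mapped_reads = 0
        unmapped_reads = 0
        with open(idxstats_file) as fh:
            for line in fh:
                # reference name, sequence length, mapped, unmapped
                reference, _length, mapped, unmapped = line.rstrip('\n').split('\t')
                all_mapped_reads += int(mapped)
                if reference == '*':
                    unmapped_reads = int(unmapped)
        return all_mapped_reads, unmapped_reads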
 
 
 def process_idxstats_file(idxstats_file):
@@ -150,44 +199,17 @@
 parser.add_argument('--dbkey', action='store', dest='dbkey', help='Reference dbkey')
 parser.add_argument('--gzipped', action='store_true', dest='gzipped', required=False, default=False, help='Input files are gzipped')
-parser.add_argument('--input_idxstats_dir', action='store', dest='input_idxstats_dir', required=False, default=None, help='Samtools idxstats input directory')
-parser.add_argument('--input_metrics_dir', action='store', dest='input_metrics_dir', required=False, default=None, help='vSNP add zero coverage metrics input directory')
-parser.add_argument('--input_reads_dir', action='store', dest='input_reads_dir', required=False, default=None, help='Samples input directory')
-parser.add_argument('--list_paired', action='store_true', dest='list_paired', required=False, default=False, help='Input samples is a list of paired reads')
 parser.add_argument('--output', action='store', dest='output', help='Output Excel statistics file')
 parser.add_argument('--read1', action='store', dest='read1', help='Required: single read')
 parser.add_argument('--read2', action='store', dest='read2', required=False, default=None, help='Optional: paired read')
 parser.add_argument('--samtools_idxstats', action='store', dest='samtools_idxstats', help='Output of samtools_idxstats')
-parser.add_argument('--vsnp_azc', action='store', dest='vsnp_azc', help='Output of vsnp_add_zero_coverage')
+parser.add_argument('--vsnp_azc_metrics', action='store', dest='vsnp_azc_metrics', help='Output of vsnp_add_zero_coverage')
 
 args = parser.parse_args()
 
-fastq_files = []
+stats_list = []
 idxstats_files = []
 metrics_files = []
 # Accumulate inputs.
-if args.read1 is not None:
-    # The inputs are not dataset collections, so
-    # read1, read2 (possibly) and vsnp_azc will also
-    # not be None.
-    fastq_files.append(args.read1)
-    idxstats_files.append(args.samtools_idxstats)
-    metrics_files.append(args.vsnp_azc)
-    if args.read2 is not None:
-        fastq_files.append(args.read2)
-        idxstats_files.append(args.samtools_idxstats)
-        metrics_files.append(args.vsnp_azc)
-else:
-    for file_name in sorted(os.listdir(args.input_reads_dir)):
-        fastq_files.append(os.path.join(args.input_reads_dir, file_name))
-    for file_name in sorted(os.listdir(args.input_idxstats_dir)):
-        idxstats_files.append(os.path.join(args.input_idxstats_dir, file_name))
-        if args.list_paired:
-            # Add the idxstats file for reverse.
-            idxstats_files.append(os.path.join(args.input_idxstats_dir, file_name))
-    for file_name in sorted(os.listdir(args.input_metrics_dir)):
-        metrics_files.append(os.path.join(args.input_metrics_dir, file_name))
-        if args.list_paired:
-            # Add the metrics file for reverse.
-            metrics_files.append(os.path.join(args.input_metrics_dir, file_name))
-output_statistics(fastq_files, idxstats_files, metrics_files, args.output, args.gzipped, args.dbkey)
+read1_stats, read2_stats = accrue_statistics(args.dbkey, args.read1, args.read2, args.gzipped)
+output_statistics(read1_stats, read2_stats, args.samtools_idxstats, args.vsnp_azc_metrics, args.output)
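For reference, the set_num_cpus() heuristic added to vsnp_build_tables.py above caps the worker count at the number of input files, and at half the CPUs when the user requests more processes than the machine has. A copy of the function with the CPU count pinned to a hypothetical 8 so the results are reproducible:

    def set_num_cpus(num_files, processes, num_cpus=8):
        if num_files < num_cpus and num_files < processes:
            return num_files
        if num_cpus < processes:
            half_cpus = int(num_cpus / 2)
            if num_files < half_cpus:
                return num_files
            return half_cpus
        return processes

    print(set_num_cpus(2, 4))    # 2: never more workers than input files
    print(set_num_cpus(20, 16))  # 4: capped at half the CPUs when oversubscribed
    print(set_num_cpus(20, 4))   # 4: otherwise the user's request is honored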