comparison vsnp_build_tables.py @ 3:abfb861df879 draft
Uploaded
author | greg |
date | Sun, 03 Jan 2021 16:21:29 +0000 |
parents | b60858c3eb91 |
children | f641e52353e8 |
2:85384a9bfba2 | 3:abfb861df879 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 | 2 |
3 import argparse | 3 import argparse |
4 import multiprocessing | |
5 import os | 4 import os |
5 import re | |
6 | |
6 import pandas | 7 import pandas |
7 import queue | |
8 import pandas.io.formats.excel | 8 import pandas.io.formats.excel |
9 import re | |
10 from Bio import SeqIO | 9 from Bio import SeqIO |
11 | 10 |
12 INPUT_JSON_AVG_MQ_DIR = 'input_json_avg_mq_dir' | |
13 INPUT_JSON_DIR = 'input_json_dir' | |
14 INPUT_NEWICK_DIR = 'input_newick_dir' | |
15 # Maximum columns allowed in a LibreOffice | 11 # Maximum columns allowed in a LibreOffice |
16 # spreadsheet is 1024. Excel allows for | 12 # spreadsheet is 1024. Excel allows for |
17 # 16,384 columns, but we'll set the lower | 13 # 16,384 columns, but we'll set the lower |
18 # number as the maximum. Some browsers | 14 # number as the maximum. Some browsers |
19 # (e.g., Firefox on Linux) are configured | 15 # (e.g., Firefox on Linux) are configured |
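Note: the MAXCOLS comment above is why output_table() can emit several spreadsheets for a single SNP table. A minimal sketch of that idea, slicing a frame that exceeds the limit into column chunks (split_wide_frame and the toy frame are illustrative assumptions, not code from this changeset):

```python
import pandas

MAXCOLS = 1024  # the LibreOffice limit referenced in the comment above


def split_wide_frame(df, maxcols=MAXCOLS):
    # Yield numbered column slices no wider than maxcols; each slice can
    # then be written to its own .xlsx file in the output collection.
    for count, start in enumerate(range(0, len(df.columns), maxcols), start=1):
        yield count, df.iloc[:, start:start + maxcols]


# A 2500-column frame yields slices of 1024, 1024 and 452 columns.
wide = pandas.DataFrame('A', index=['root'], columns=range(2500))
for count, chunk in split_wide_frame(wide):
    print(count, chunk.shape)
```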
30 all_ref = ref_df[ref_df['reference'] == gbk_chrome] | 26 all_ref = ref_df[ref_df['reference'] == gbk_chrome] |
31 positions = all_ref.position.to_frame() | 27 positions = all_ref.position.to_frame() |
32 # Create an annotation file. | 28 # Create an annotation file. |
33 annotation_file = "%s_annotations.csv" % group | 29 annotation_file = "%s_annotations.csv" % group |
34 with open(annotation_file, "a") as fh: | 30 with open(annotation_file, "a") as fh: |
35 for index, row in positions.iterrows(): | 31 for _, row in positions.iterrows(): |
36 pos = row.position | 32 pos = row.position |
37 try: | 33 try: |
38 aaa = pro.iloc[pro.index.get_loc(int(pos))][['chrom', 'locus', 'product', 'gene']] | 34 aaa = pro.iloc[pro.index.get_loc(int(pos))][['chrom', 'locus', 'product', 'gene']] |
39 try: | 35 try: |
40 chrom, name, locus, tag = aaa.values[0] | 36 chrom, name, locus, tag = aaa.values[0] |
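The aaa lookup above works because pro carries a pandas IntervalIndex built from each feature's start/stop coordinates (see the IntervalIndex.from_arrays call later in this hunk), so get_loc() maps a SNP position to the annotation row whose interval contains it. A self-contained sketch with made-up coordinates:

```python
import pandas

# Hypothetical feature table; the real one is parsed from the GenBank file.
pro = pandas.DataFrame({
    'start': [1, 500, 1200],
    'stop': [499, 1199, 2000],
    'locus': ['locus_A', 'locus_B', 'locus_C'],
    'gene': ['geneA', 'geneB', 'geneC'],
})
pro.index = pandas.IntervalIndex.from_arrays(pro['start'], pro['stop'], closed='both')

pos = 750
# get_loc() returns the position of the interval containing pos, so iloc
# pulls the matching annotation row.
row = pro.iloc[pro.index.get_loc(pos)]
print(row['locus'], row['gene'])  # locus_B geneB
```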
142 pro.index = pandas.IntervalIndex.from_arrays(pro['start'], pro['stop'], closed='both') | 138 pro.index = pandas.IntervalIndex.from_arrays(pro['start'], pro['stop'], closed='both') |
143 annotation_dict[chromosome] = pro | 139 annotation_dict[chromosome] = pro |
144 return annotation_dict | 140 return annotation_dict |
145 | 141 |
146 | 142 |
147 def get_base_file_name(file_path): | 143 def get_sample_name(file_path): |
148 base_file_name = os.path.basename(file_path) | 144 base_file_name = os.path.basename(file_path) |
149 if base_file_name.find(".") > 0: | 145 if base_file_name.find(".") > 0: |
150 # Eliminate the extension. | 146 # Eliminate the extension. |
151 return os.path.splitext(base_file_name)[0] | 147 return os.path.splitext(base_file_name)[0] |
152 elif base_file_name.find("_") > 0: | 148 return base_file_name |
153 # The dot extension was likely changed to | |
154 # the " character. | |
155 items = base_file_name.split("_") | |
156 return "_".join(items[0:-1]) | |
157 else: | |
158 return base_file_name | |
159 | 149 |
160 | 150 |
161 def output_cascade_table(cascade_order, mqdf, group, annotation_dict): | 151 def output_cascade_table(cascade_order, mqdf, group, annotation_dict): |
162 cascade_order_mq = pandas.concat([cascade_order, mqdf], join='inner') | 152 cascade_order_mq = pandas.concat([cascade_order, mqdf], join='inner') |
163 output_table(cascade_order_mq, "cascade", group, annotation_dict) | 153 output_table(cascade_order_mq, "cascade", group, annotation_dict) |
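The renamed get_sample_name() earlier in this hunk (replacing get_base_file_name()) now only strips the directory and a single dot extension instead of also guessing at underscore-mangled names. A quick usage sketch, with made-up file names:

```python
import os


def get_sample_name(file_path):
    # Mirrors the new helper above: basename, then drop one extension if present.
    base_file_name = os.path.basename(file_path)
    if base_file_name.find(".") > 0:
        return os.path.splitext(base_file_name)[0]
    return base_file_name


print(get_sample_name('/tmp/Mbovis-01D6.newick'))  # Mbovis-01D6
print(get_sample_name('/tmp/Mbovis-01D6'))         # Mbovis-01D6
```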
166 def output_excel(df, type_str, group, annotation_dict, count=None): | 156 def output_excel(df, type_str, group, annotation_dict, count=None): |
167 # Output the temporary json file that | 157 # Output the temporary json file that |
168 # is used by the excel_formatter. | 158 # is used by the excel_formatter. |
169 if count is None: | 159 if count is None: |
170 if group is None: | 160 if group is None: |
171 json_file_name = "%s_order_mq.json" % type_str | 161 json_file_name = os.path.join(OUTPUT_EXCEL_DIR, "%s_order_mq.json" % type_str) |
172 excel_file_name = os.path.join(OUTPUT_EXCEL_DIR, "%s_table.xlsx" % type_str) | 162 excel_file_name = os.path.join(OUTPUT_EXCEL_DIR, "%s_table.xlsx" % type_str) |
173 else: | 163 else: |
174 json_file_name = "%s_%s_order_mq.json" % (group, type_str) | 164 json_file_name = os.path.join(OUTPUT_EXCEL_DIR, "%s_%s_order_mq.json" % (group, type_str)) |
175 excel_file_name = os.path.join(OUTPUT_EXCEL_DIR, "%s_%s_table.xlsx" % (group, type_str)) | 165 excel_file_name = os.path.join(OUTPUT_EXCEL_DIR, "%s_%s_table.xlsx" % (group, type_str)) |
176 else: | 166 else: |
167 # The table has more columns than is allowed by the | |
168 # MAXCOLS setting, so multiple files will be produced | |
169 # as an output collection. | |
177 if group is None: | 170 if group is None: |
178 json_file_name = "%s_order_mq_%d.json" % (type_str, count) | 171 json_file_name = os.path.join(OUTPUT_EXCEL_DIR, "%s_order_mq_%d.json" % (type_str, count)) |
179 excel_file_name = os.path.join(OUTPUT_EXCEL_DIR, "%s_table_%d.xlsx" % (type_str, count)) | 172 excel_file_name = os.path.join(OUTPUT_EXCEL_DIR, "%s_table_%d.xlsx" % (type_str, count)) |
180 else: | 173 else: |
181 json_file_name = "%s_%s_order_mq_%d.json" % (group, type_str, count) | 174 json_file_name = os.path.join(OUTPUT_EXCEL_DIR, "%s_%s_order_mq_%d.json" % (group, type_str, count)) |
182 excel_file_name = os.path.join(OUTPUT_EXCEL_DIR, "%s_%s_table_%d.xlsx" % (group, type_str, count)) | 175 excel_file_name = os.path.join(OUTPUT_EXCEL_DIR, "%s_%s_table_%d.xlsx" % (group, type_str, count)) |
183 df.to_json(json_file_name, orient='split') | 176 df.to_json(json_file_name, orient='split') |
184 # Output the Excel file. | 177 # Output the Excel file. |
185 excel_formatter(json_file_name, excel_file_name, group, annotation_dict) | 178 excel_formatter(json_file_name, excel_file_name, group, annotation_dict) |
186 | 179 |
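output_excel() above first dumps the frame as JSON with orient='split' (now into OUTPUT_EXCEL_DIR alongside the .xlsx outputs) and then hands both paths to excel_formatter(), which is defined elsewhere in this file. A minimal sketch of why orient='split' is used, assuming only a throwaway frame and file name:

```python
import pandas

df = pandas.DataFrame({'pos_1977': ['A', 'G'], 'pos_4925': ['T', 'C']},
                      index=['root', 'sample1'])

# orient='split' stores index, columns and data separately, so the
# formatter can rebuild exactly the same table before styling it.
json_file_name = 'cascade_order_mq.json'  # illustrative name only
df.to_json(json_file_name, orient='split')

restored = pandas.read_json(json_file_name, orient='split')
assert restored.equals(df)
```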
226 output_excel(df_of_type, type_str, group_str, annotation_dict, count=count) | 219 output_excel(df_of_type, type_str, group_str, annotation_dict, count=count) |
227 else: | 220 else: |
228 output_excel(df, type_str, group_str, annotation_dict) | 221 output_excel(df, type_str, group_str, annotation_dict) |
229 | 222 |
230 | 223 |
231 def preprocess_tables(task_queue, annotation_dict, timeout): | 224 def preprocess_tables(newick_file, json_file, json_avg_mq_file, annotation_dict): |
232 while True: | 225 avg_mq_series = pandas.read_json(json_avg_mq_file, typ='series', orient='split') |
233 try: | 226 # Map quality to dataframe. |
234 tup = task_queue.get(block=True, timeout=timeout) | 227 mqdf = avg_mq_series.to_frame(name='MQ') |
235 except queue.Empty: | 228 mqdf = mqdf.T |
236 break | 229 # Get the group. |
237 newick_file, json_file, json_avg_mq_file = tup | 230 group = get_sample_name(newick_file) |
238 avg_mq_series = pandas.read_json(json_avg_mq_file, typ='series', orient='split') | 231 snps_df = pandas.read_json(json_file, orient='split') |
239 # Map quality to dataframe. | 232 with open(newick_file, 'r') as fh: |
240 mqdf = avg_mq_series.to_frame(name='MQ') | 233 for line in fh: |
241 mqdf = mqdf.T | 234 line = re.sub('[:,]', '\n', line) |
242 # Get the group. | 235 line = re.sub('[)(]', '', line) |
243 group = get_base_file_name(newick_file) | 236 line = re.sub(r'[0-9].*\.[0-9].*\n', '', line) |
244 snps_df = pandas.read_json(json_file, orient='split') | 237 line = re.sub('root\n', '', line) |
245 with open(newick_file, 'r') as fh: | 238 sample_order = line.split('\n') |
246 for line in fh: | 239 sample_order = list([_f for _f in sample_order if _f]) |
247 line = re.sub('[:,]', '\n', line) | 240 sample_order.insert(0, 'root') |
248 line = re.sub('[)(]', '', line) | 241 tree_order = snps_df.loc[sample_order] |
249 line = re.sub(r'[0-9].*\.[0-9].*\n', '', line) | 242 # Count number of SNPs in each column. |
250 line = re.sub('root\n', '', line) | 243 snp_per_column = [] |
251 sample_order = line.split('\n') | 244 for column_header in tree_order: |
252 sample_order = list([_f for _f in sample_order if _f]) | 245 count = 0 |
253 sample_order.insert(0, 'root') | 246 column = tree_order[column_header] |
254 tree_order = snps_df.loc[sample_order] | 247 for element in column: |
255 # Count number of SNPs in each column. | 248 if element != column[0]: |
256 snp_per_column = [] | 249 count = count + 1 |
257 for column_header in tree_order: | 250 snp_per_column.append(count) |
258 count = 0 | 251 row1 = pandas.Series(snp_per_column, tree_order.columns, name="snp_per_column") |
259 column = tree_order[column_header] | 252 # Count number of SNPS from the |
260 for element in column: | 253 # top of each column in the table. |
261 if element != column[0]: | 254 snp_from_top = [] |
262 count = count + 1 | 255 for column_header in tree_order: |
263 snp_per_column.append(count) | 256 count = 0 |
264 row1 = pandas.Series(snp_per_column, tree_order.columns, name="snp_per_column") | 257 column = tree_order[column_header] |
265 # Count number of SNPS from the | 258 # for each element in the column |
266 # top of each column in the table. | 259 # skip the first element |
267 snp_from_top = [] | 260 for element in column[1:]: |
268 for column_header in tree_order: | 261 if element == column[0]: |
269 count = 0 | 262 count = count + 1 |
270 column = tree_order[column_header] | 263 else: |
271 # for each element in the column | 264 break |
272 # skip the first element | 265 snp_from_top.append(count) |
273 for element in column[1:]: | 266 row2 = pandas.Series(snp_from_top, tree_order.columns, name="snp_from_top") |
274 if element == column[0]: | 267 tree_order = tree_order.append([row1]) |
275 count = count + 1 | 268 tree_order = tree_order.append([row2]) |
276 else: | 269 # In pandas=0.18.1 even this does not work: |
277 break | 270 # abc = row1.to_frame() |
278 snp_from_top.append(count) | 271 # abc = abc.T --> tree_order.shape (5, 18), abc.shape (1, 18) |
279 row2 = pandas.Series(snp_from_top, tree_order.columns, name="snp_from_top") | 272 # tree_order.append(abc) |
280 tree_order = tree_order.append([row1]) | 273 # Continue to get error: "*** ValueError: all the input arrays must have same number of dimensions" |
281 tree_order = tree_order.append([row2]) | 274 tree_order = tree_order.T |
282 # In pandas=0.18.1 even this does not work: | 275 tree_order = tree_order.sort_values(['snp_from_top', 'snp_per_column'], ascending=[True, False]) |
283 # abc = row1.to_frame() | 276 tree_order = tree_order.T |
284 # abc = abc.T --> tree_order.shape (5, 18), abc.shape (1, 18) | 277 # Remove snp_per_column and snp_from_top rows. |
285 # tree_order.append(abc) | 278 cascade_order = tree_order[:-2] |
286 # Continue to get error: "*** ValueError: all the input arrays must have same number of dimensions" | 279 # Output the cascade table. |
287 tree_order = tree_order.T | 280 output_cascade_table(cascade_order, mqdf, group, annotation_dict) |
288 tree_order = tree_order.sort_values(['snp_from_top', 'snp_per_column'], ascending=[True, False]) | 281 # Output the sorted table. |
289 tree_order = tree_order.T | 282 output_sort_table(cascade_order, mqdf, group, annotation_dict) |
290 # Remove snp_per_column and snp_from_top rows. | |
291 cascade_order = tree_order[:-2] | |
292 # Output the cascade table. | |
293 output_cascade_table(cascade_order, mqdf, group, annotation_dict) | |
294 # Output the sorted table. | |
295 output_sort_table(cascade_order, mqdf, group, annotation_dict) | |
296 task_queue.task_done() | |
297 | |
298 | |
299 def set_num_cpus(num_files, processes): | |
300 num_cpus = int(multiprocessing.cpu_count()) | |
301 if num_files < num_cpus and num_files < processes: | |
302 return num_files | |
303 if num_cpus < processes: | |
304 half_cpus = int(num_cpus / 2) | |
305 if num_files < half_cpus: | |
306 return num_files | |
307 return half_cpus | |
308 return processes | |
309 | 283 |
310 | 284 |
311 if __name__ == '__main__': | 285 if __name__ == '__main__': |
312 parser = argparse.ArgumentParser() | 286 parser = argparse.ArgumentParser() |
313 | 287 |
314 parser.add_argument('--input_avg_mq_json', action='store', dest='input_avg_mq_json', required=False, default=None, help='Average MQ json file') | |
315 parser.add_argument('--input_newick', action='store', dest='input_newick', required=False, default=None, help='Newick file') | |
316 parser.add_argument('--input_snps_json', action='store', dest='input_snps_json', required=False, default=None, help='SNPs json file') | |
317 parser.add_argument('--gbk_file', action='store', dest='gbk_file', required=False, default=None, help='Optional gbk file'), | 288 parser.add_argument('--gbk_file', action='store', dest='gbk_file', required=False, default=None, help='Optional gbk file'), |
318 parser.add_argument('--processes', action='store', dest='processes', type=int, help='User-selected number of processes to use for job splitting') | 289 parser.add_argument('--input_avg_mq_json', action='store', dest='input_avg_mq_json', help='Average MQ json file') |
290 parser.add_argument('--input_newick', action='store', dest='input_newick', help='Newick file') | |
291 parser.add_argument('--input_snps_json', action='store', dest='input_snps_json', help='SNPs json file') | |
319 | 292 |
320 args = parser.parse_args() | 293 args = parser.parse_args() |
321 | 294 |
322 if args.gbk_file is not None: | 295 if args.gbk_file is not None: |
323 # Create the annotation_dict for annotating | 296 # Create the annotation_dict for annotating |
324 # the Excel tables. | 297 # the Excel tables. |
325 annotation_dict = get_annotation_dict(args.gbk_file) | 298 annotation_dict = get_annotation_dict(args.gbk_file) |
326 else: | 299 else: |
327 annotation_dict = None | 300 annotation_dict = None |
328 | 301 |
329 # The assumption here is that the list of files | 302 preprocess_tables(args.input_newick, args.input_snps_json, args.input_avg_mq_json, annotation_dict) |
330 # in both INPUT_NEWICK_DIR and INPUT_JSON_DIR are | |
331 # named such that they are properly matched if | |
332 # the directories contain more than 1 file (i.e., | |
333 # hopefully the newick file names and json file names | |
334 # will be something like Mbovis-01D6_* so they can be | |
335 # sorted and properly associated with each other). | |
336 if args.input_newick is not None: | |
337 newick_files = [args.input_newick] | |
338 else: | |
339 newick_files = [] | |
340 for file_name in sorted(os.listdir(INPUT_NEWICK_DIR)): | |
341 file_path = os.path.abspath(os.path.join(INPUT_NEWICK_DIR, file_name)) | |
342 newick_files.append(file_path) | |
343 if args.input_snps_json is not None: | |
344 json_files = [args.input_snps_json] | |
345 else: | |
346 json_files = [] | |
347 for file_name in sorted(os.listdir(INPUT_JSON_DIR)): | |
348 file_path = os.path.abspath(os.path.join(INPUT_JSON_DIR, file_name)) | |
349 json_files.append(file_path) | |
350 if args.input_avg_mq_json is not None: | |
351 json_avg_mq_files = [args.input_avg_mq_json] | |
352 else: | |
353 json_avg_mq_files = [] | |
354 for file_name in sorted(os.listdir(INPUT_JSON_AVG_MQ_DIR)): | |
355 file_path = os.path.abspath(os.path.join(INPUT_JSON_AVG_MQ_DIR, file_name)) | |
356 json_avg_mq_files.append(file_path) | |
357 | |
358 multiprocessing.set_start_method('spawn') | |
359 queue1 = multiprocessing.JoinableQueue() | |
360 queue2 = multiprocessing.JoinableQueue() | |
361 num_files = len(newick_files) | |
362 cpus = set_num_cpus(num_files, args.processes) | |
363 # Set a timeout for get()s in the queue. | |
364 timeout = 0.05 | |
365 | |
366 for i, newick_file in enumerate(newick_files): | |
367 json_file = json_files[i] | |
368 json_avg_mq_file = json_avg_mq_files[i] | |
369 queue1.put((newick_file, json_file, json_avg_mq_file)) | |
370 | |
371 # Complete the preprocess_tables task. | |
372 processes = [multiprocessing.Process(target=preprocess_tables, args=(queue1, annotation_dict, timeout, )) for _ in range(cpus)] | |
373 for p in processes: | |
374 p.start() | |
375 for p in processes: | |
376 p.join() | |
377 queue1.join() | |
378 | |
379 if queue1.empty(): | |
380 queue1.close() | |
381 queue1.join_thread() |
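For reference, the regex chain in the new preprocess_tables() reduces a Newick tree to the tip order used to reindex the SNP table (snps_df.loc[sample_order]). A standalone sketch with a toy two-sample tree; the sample names are invented, and the trailing newline matters because the branch-length pattern anchors on it:

```python
import re

# As read from the newick file: one line, including its trailing newline.
line = '(root:0.0,(sample1:0.00123,sample2:0.00456):0.00789);\n'

line = re.sub('[:,]', '\n', line)                # one token per line
line = re.sub('[)(]', '', line)                  # drop parentheses
line = re.sub(r'[0-9].*\.[0-9].*\n', '', line)   # drop branch-length lines
line = re.sub('root\n', '', line)                # drop the root label
sample_order = [f for f in line.split('\n') if f]
sample_order.insert(0, 'root')                   # root always leads the table
print(sample_order)  # ['root', 'sample1', 'sample2']
```

With the multiprocessing queue removed, one invocation now handles one newick/SNPs/average-MQ trio, e.g. python vsnp_build_tables.py --input_newick tree.newick --input_snps_json snps.json --input_avg_mq_json avg_mq.json --gbk_file ref.gbk (file names here are placeholders).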