# HG changeset patch
# User immport-devteam
# Date 1488218582 18000
# Node ID 426650130311bb8516e2e9a830a3a6b7e059f1c2
Uploaded

diff -r 000000000000 -r 426650130311 merge_ds_flowtext/FCStxtMergeDownsample.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_ds_flowtext/FCStxtMergeDownsample.py	Mon Feb 27 13:03:02 2017 -0500
@@ -0,0 +1,225 @@
+#!/usr/bin/env python
+
+######################################################################
+#                 Copyright (c) 2016 Northrop Grumman.
+#                        All rights reserved.
+######################################################################
+
+from __future__ import print_function
+from __future__ import division
+import sys
+import os
+import pandas as pd
+from argparse import ArgumentParser
+
+
+def is_number(s):
+    try:
+        float(s)
+        return True
+    except ValueError:
+        return False
+
+
+def is_integer(s):
+    try:
+        int(s)
+        return True
+    except ValueError:
+        return False
+
+
+def compare_headers(files):
+    headers = {}
+    for eachfile in files:
+        with open(eachfile, "r") as ef:
+            headers[eachfile] = ef.readline().strip().lower().split("\t")
+
+    hdgs_in_common = []
+    flag = {}
+
+    for ref_hdgs in headers[files[0]]:
+        flag[ref_hdgs] = 1
+
+        for ij in range(1, len(files)):
+            if ref_hdgs in headers[files[ij]]:
+                flag[ref_hdgs] += 1
+        if flag[ref_hdgs] == len(files):
+            hdgs_in_common.append(ref_hdgs)
+
+    if not hdgs_in_common:
+        sys.exit(9)
+    return(hdgs_in_common)
+
+
+def get_headers_index(list_headings, headings):
+    idxs = []
+    lhdgs = [x.lower() for x in headings]
+    for element in list_headings:
+        idxs.append(int(lhdgs.index(element)))
+    return(idxs)
+
+
+def merge_and_DS_txt(in_files, out_file, col_names, factor_ds):
+    """Concatenates tab-separated files.
+
+    The output contains only the columns common to all input files, as
+    determined by their headers. All lines after the header line must
+    contain only numbers. Potential errors are logged to stderr; if the
+    number of errors reaches 10, the program stops. If a downsampling
+    factor is given, only the indicated fraction of randomly selected
+    lines is kept from each file.
+    """
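+    # Example (illustrative only, hypothetical file names): merge two
+    # text-converted FCS files on their shared columns and keep ~10% of
+    # the events of each one:
+    #   merge_and_DS_txt(["f1.flowtext", "f2.flowtext"], "merged.flowtext", [], 0.1)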
+ """ + + nb_errors = 0 + max_error = 10 + + # get list of headers in common to all files + list_hdgs = compare_headers(in_files) + + with open(out_file, "w") as outf: + ff_order = [] + # HEADERS: + with open(in_files[0], "r") as first_file: + headings_ff = first_file.readline().strip() + headings = headings_ff.split("\t") + # Get index of headers in common: + hdrs_idx = get_headers_index(list_hdgs, headings) + + # If column to merge on were provided: + if col_names: + for ix in col_names: + if ix not in hdrs_idx: + nb_errors += 1 + sys.stderr.write(" ".join(["WARNING: column", str(ix), "in", in_files[0], + "does not exist in all files or has a different header.\n"])) + hdrs_idx = col_names + + # Print out to output file: + headings_to_write = [] + for cti in range(0, len(headings)): + if cti in hdrs_idx: + headings_to_write.append(headings[cti]) + ff_order.append(headings[cti]) + outf.write("\t".join(headings_to_write) + "\n") + + # DATA + for infile in in_files: + with open(infile, "r") as inf: + headings_inf = inf.readline().strip() + hdgs = headings_inf.split("\t") + # Get the index of columns to keep: + hdgs_idx = [] + for ctc in ff_order: + hdgs_idx.append(int(hdgs.index(ctc))) + if col_names: + for iy in col_names: + if iy not in hdgs_idx: + nb_errors += 1 + sys.stderr.write(" ".join(["WARNING: column", str(iy), "in", infile, + "does not exist in all files or has a different header.\n"])) + hdgs_idx = col_names + + df = pd.read_table(infile, usecols=hdrs_idx) + wc_file = len(df.index) - 1 + df_ds = df.sample(int(wc_file * factor_ds), replace=False) + + for cols in df_ds.columns.values: + if df_ds[cols].count() != len(df_ds[cols]): + sys.stderr.write(infile + "contains non-numeric data\n") + + with open(infile, "r") as checkfile: + fl = checkfile.readline() + count_lines = 1 + for checklines in checkfile: + to_check = checklines.strip().split("\t") + count_lines += 1 + for item in to_check: + if not is_number(item): + sys.stderr.write(" ".join(["WARNING: line", str(count_lines), + "in", infile, "contains non-numeric results\n"])) + sys.exit(2) + + df_ds = df_ds.ix[:, ff_order] + df_ds.to_csv(outf, sep="\t", header=False, index=False) + + if nb_errors > 0: + exit_code = 3 + if nb_errors == max_error: + exit_code = 4 + sys.stderr.write("Run aborted - too many errors.") + os.remove(out_file) + sys.exit(exit_code) + return + + +if __name__ == "__main__": + parser = ArgumentParser( + prog="FCStxtmerge", + description="Merge based on headers text-converted FCS files into one text file.") + + parser.add_argument( + '-i', + dest="input_files", + required=True, + action='append', + help="File location for the text files.") + + parser.add_argument( + '-o', + dest="output_file", + required=True, + help="Name of the output file.") + + parser.add_argument( + '-c', + dest="columns", + help="Specify which column to keep in output file") + + parser.add_argument( + '-d', + dest="downsampling_factor", + help="How much of each file to keep") + + args = parser.parse_args() + + # Get columns to merge on if any: + default_value_col = ["i.e.:1,2,5", "default", "Default"] + columns = [] + if args.columns: + if args.columns not in default_value_col: + tmp_col = args.columns.split(",") + if len(tmp_col) == 1: + if not tmp_col[0].strip(): + columns = [] + elif not is_integer(tmp_col[0].strip()): + sys.exit(7) + else: + columns.append(int(tmp_col[0].strip()) - 1) + else: + for c in range(0, len(tmp_col)): + if not is_integer(tmp_col[c].strip()): + sys.exit(6) + else: + columns.append(int(tmp_col[c].strip()) - 
+
+    # Get downsampling factor, if any:
+    # Note: change '%' to 'X' because somehow that's what Galaxy passes?
+    default_value_ds = ["i.e.:0.1 or 10X", "default", "Default"]
+    ds_factor = 1
+    if args.downsampling_factor:
+        if args.downsampling_factor not in default_value_ds:
+            args.downsampling_factor = args.downsampling_factor.strip()
+            downsampling_factor = args.downsampling_factor.rstrip("X")
+            if is_number(downsampling_factor):
+                ds_factor = float(downsampling_factor)
+                if ds_factor > 1:
+                    ds_factor = float(downsampling_factor) / 100
+                if ds_factor > 100:
+                    sys.exit(8)
+            else:
+                sys.exit(8)
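+    # For illustration: "-d 0.3" keeps roughly 30% of the events of each file,
+    # and "-d 30X" is stripped to 30 and rescaled to 30 / 100 = 0.3; the
+    # sentinel values in default_value_ds leave ds_factor at 1 (keep everything).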
+
+    input_files = [f for f in args.input_files]
+    merge_and_DS_txt(input_files, args.output_file, columns, ds_factor)
+    sys.exit(0)
diff -r 000000000000 -r 426650130311 merge_ds_flowtext/FCStxtMergeDownsample.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_ds_flowtext/FCStxtMergeDownsample.xml	Mon Feb 27 13:03:02 2017 -0500
@@ -0,0 +1,175 @@
+txt-converted FCS files into one text file based on headers.
+numpy
+pandas
+10.1038/srep02327
diff -r 000000000000 -r 426650130311 merge_ds_flowtext/test-data/merge1.flowtext
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_ds_flowtext/test-data/merge1.flowtext	Mon Feb 27 13:03:02 2017 -0500
@@ -0,0 +1,19 @@
+CD4	CCR3	CD8	CCR7
+437	69	0	146
+551	129	169	292
+199	277	320	227
+83	138	335	194
+534	111	83	177
+499	0	0	224
+175	361	225	237
+216	310	270	294
+519	44	51	148
+550	200	0	127
+552	479	0	62
+525	121	0	138
+438	0	626	480
+139	227	293	259
+0	292	641	327
+30	147	483	386
+537	338	568	201
+156	228	734	408
diff -r 000000000000 -r 426650130311 merge_ds_flowtext/test-data/merge2.flowtext
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_ds_flowtext/test-data/merge2.flowtext	Mon Feb 27 13:03:02 2017 -0500
@@ -0,0 +1,25 @@
+Forward Scatter	Side Scatter	FITC CD4
+340	115	509
+262	73	437
+894	1023	199
+316	76	50
+449	157	551
+388	97	534
+383	139	499
+394	144	83
+372	126	519
+788	1023	216
+1023	1023	289
+363	76	550
+668	1019	73
+420	211	552
+770	1023	175
+602	578	385
+418	105	561
+352	153	30
+383	190	156
+733	970	139
+451	120	537
+373	104	3
+358	185	0
+289	56	438
diff -r 000000000000 -r 426650130311 merge_ds_flowtext/test-data/test1/input1.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_ds_flowtext/test-data/test1/input1.txt	Mon Feb 27 13:03:02 2017 -0500
@@ -0,0 +1,10 @@
+CD4	CCR3	CD8	CCR7
+551	129	169	292
+199	277	320	227
+437	69	0	146
+509	268	0	74
+50	0	60	129
+83	138	335	194
+499	0	0	224
+239	284	288	280
+534	111	83	177
diff -r 000000000000 -r 426650130311 merge_ds_flowtext/test-data/test1/input2.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_ds_flowtext/test-data/test1/input2.txt	Mon Feb 27 13:03:02 2017 -0500
@@ -0,0 +1,10 @@
+CD4	CCR3	CD8	CCR7
+550	200	0	127
+519	44	51	148
+289	401	362	254
+175	361	225	237
+525	121	0	138
+385	286	222	131
+216	310	270	294
+552	479	0	62
+73	193	227	132
diff -r 000000000000 -r 426650130311 merge_ds_flowtext/test-data/test1/input3.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_ds_flowtext/test-data/test1/input3.txt	Mon Feb 27 13:03:02 2017 -0500
@@ -0,0 +1,10 @@
+CD4	CCR3	CD8	CCR7
+438	0	626	480
+30	147	483	386
+156	228	734	408
+432	121	598	555
+537	338	568	201
+3	110	621	584
+561	0	610	562
+0	292	641	327
+139	227	293	259
diff -r 000000000000 -r 426650130311 merge_ds_flowtext/test-data/test2/input1.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_ds_flowtext/test-data/test2/input1.txt	Mon Feb 27 13:03:02 2017 -0500
@@ -0,0 +1,10 @@
+Forward Scatter	Side Scatter	FITC CD4	PE CCR3	PP CD8	APC CCR4
+449	157	551	129	169	292
+894	1023	199	277	320	227
+262	73	437	69	0	146
+340	115	509	268	0	74
+316	76	50	0	60	129
+394	144	83	138	335	194
+383	139	499	0	0	224
+800	1023	239	284	288	280
+388	97	534	111	83	177
diff -r 000000000000 -r 426650130311 merge_ds_flowtext/test-data/test2/input2.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_ds_flowtext/test-data/test2/input2.txt	Mon Feb 27 13:03:02 2017 -0500
@@ -0,0 +1,10 @@
+Forward Scatter	Side Scatter	FITC CD4	PE CXCR3	PP CD8	APC CCR5
+363	76	550	200	0	127
+372	126	519	44	51	148
+1023	1023	289	401	362	254
+770	1023	175	361	225	237
+384	111	525	121	0	138
+602	578	385	286	222	131
+788	1023	216	310	270	294
+420	211	552	479	0	62
+668	1019	73	193	227	132
diff -r 000000000000 -r 426650130311 merge_ds_flowtext/test-data/test2/input3.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_ds_flowtext/test-data/test2/input3.txt	Mon Feb 27 13:03:02 2017 -0500
@@ -0,0 +1,10 @@
+Forward Scatter	Side Scatter	FITC CD4	PE CD25	PP CD3	APC CD45RA
+289	56	438	0	626	480
+352	153	30	147	483	386
+383	190	156	228	734	408
+261	62	432	121	598	555
+451	120	537	338	568	201
+373	104	3	110	621	584
+418	105	561	0	610	562
+358	185	0	292	641	327
+733	970	139	227	293	259