diff FCStxtMergeDownsample.py @ 1:3c0e4179be7a draft default tip

"planemo upload for repository https://github.com/ImmPortDB/immport-galaxy-tools/tree/master/flowtools/merge_ds_flowtext commit 7858e5b085fc3c60c88fe87b2f343969d50d9b1e"
author azomics
date Mon, 22 Jun 2020 17:42:26 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/FCStxtMergeDownsample.py	Mon Jun 22 17:42:26 2020 -0400
@@ -0,0 +1,243 @@
+#!/usr/bin/env python
+
+######################################################################
+#                  Copyright (c) 2016 Northrop Grumman.
+#                          All rights reserved.
+######################################################################
+
+from __future__ import print_function
+from __future__ import division
+import sys
+import os
+import pandas as pd
+from argparse import ArgumentParser
+
+
+def is_number(s):
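+    """Return True if s can be parsed as a float."""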
+    try:
+        float(s)
+        return True
+    except ValueError:
+        return False
+
+
+def is_integer(s):
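+    """Return True if s can be parsed as an int."""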
+    try:
+        int(s)
+        return True
+    except ValueError:
+        return False
+
+
+def compare_headers(files):
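+    """Return the list of column headings (lowercased) common to all files."""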
+    headers = {}
+    for eachfile in files:
+        with open(eachfile, "r") as ef:
+            headers[eachfile] = ef.readline().strip().lower().split("\t")
+
+    hdgs_in_common = []
+    flag = {}
+
+    for ref_hdgs in headers[files[0]]:
+        flag[ref_hdgs] = 1
+
+        for ij in range(1, len(files)):
+            if ref_hdgs in headers[files[ij]]:
+                flag[ref_hdgs] += 1
+        if flag[ref_hdgs] == len(files):
+            hdgs_in_common.append(ref_hdgs)
+
+    if not hdgs_in_common:
+        sys.exit(9)
+    return hdgs_in_common
+
+
+def get_nb_lines(files):
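+    """Return the total number of data rows (events) across all input files."""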
+    tot_event = 0
+    for f in files:
+        df = pd.read_table(f)
+        # The header line is already consumed by the reader, so every
+        # remaining row is a data row.
+        tot_event += len(df.index)
+    return tot_event
+
+
+def get_headers_index(list_headings, headings):
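+    """Return the index of each heading of list_headings within headings (case-insensitive)."""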
+    idxs = []
+    lhdgs = [x.lower() for x in headings]
+    for element in list_headings:
+        idxs.append(lhdgs.index(element))
+    return idxs
+
+
+def merge_and_DS_txt(in_files, out_file, col_names, ds_factor):
+    """Concatenates together tab-separated files.
+    The output will have only the columns in common to all the files provided
+    as input, as determined by the headers.
+    All lines after the header line must contain only numbers.
+    Potential errors are logged to stderr. If the number of errors reaches 10,
+    the program stops.
+    If a downsampling factor is given, returns the indicated fraction of
+    random lines.
+    """
+
+    nb_errors = 0
+    max_error = 10
+
+    # get list of headers in common to all files
+    list_hdgs = compare_headers(in_files)
+    total_events = get_nb_lines(in_files)
+    total_final = total_events * ds_factor
+    nb_per_file = int(total_final / len(in_files))
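+    # Worked example (hypothetical numbers): three input files of 10,000
+    # events each with ds_factor = 0.1 give total_final = 3000.0 and
+    # nb_per_file = 1000.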
+
+    with open(out_file, "w") as outf:
+        ff_order = []
+        # HEADERS:
+        with open(in_files[0], "r") as first_file:
+            headings_ff = first_file.readline().strip()
+            headings = headings_ff.split("\t")
+            # Get index of headers in common:
+            hdrs_idx = get_headers_index(list_hdgs, headings)
+
+            # If columns to keep were provided:
+            if col_names:
+                for ix in col_names:
+                    if ix not in hdrs_idx:
+                        nb_errors += 1
+                        sys.stderr.write(" ".join(["WARNING: column", str(ix), "in", in_files[0],
+                                                   "does not exist in all files or has a different header.\n"]))
+                        if nb_errors == max_error:
+                            exit_code = 4
+                            sys.stderr.write("Run aborted - too many errors.\n")
+                            os.remove(out_file)
+                            sys.exit(exit_code)
+                hdrs_idx = col_names
+
+            # Print out to output file:
+            headings_to_write = []
+            for cti, hdg in enumerate(headings):
+                if cti in hdrs_idx:
+                    headings_to_write.append(hdg)
+                    ff_order.append(hdg)
+            outf.write("\t".join(headings_to_write) + "\n")
+
+        # DATA
+        for infile in in_files:
+            with open(infile, "r") as inf:
+                headings_inf = inf.readline().strip()
+                hdgs = headings_inf.split("\t")
+                # Get the index of columns to keep:
+                hdgs_idx = []
+                for ctc in ff_order:
+                    hdgs_idx.append(hdgs.index(ctc))
+                if col_names:
+                    for iy in col_names:
+                        if iy not in hdgs_idx:
+                            nb_errors += 1
+                            sys.stderr.write(" ".join(["WARNING: column", str(iy), "in", infile,
+                                                       "does not exist in all files or has a different header.\n"]))
+                            if nb_errors == max_error:
+                                exit_code = 4
+                                sys.stderr.write("Run aborted - too many errors.\n")
+                                os.remove(out_file)
+                                sys.exit(exit_code)
+                    hdgs_idx = col_names
+
+            df = pd.read_table(infile, usecols=hdgs_idx)
+            # Cap the sample size at the number of rows actually available.
+            df_ds = df.sample(n=min(nb_per_file, len(df.index)), replace=False)
+
+            for cols in df_ds.columns.values:
+                if df_ds[cols].count() != len(df_ds[cols]):
+                    sys.stderr.write(infile + " contains non-numeric data\n")
+
+                    with open(infile, "r") as checkfile:
+                        checkfile.readline()  # skip the header line
+                        count_lines = 1
+                        for checklines in checkfile:
+                            to_check = checklines.strip().split("\t")
+                            count_lines += 1
+                            for item in to_check:
+                                if not is_number(item):
+                                    sys.stderr.write(" ".join(["WARNING: line", str(count_lines),
+                                                               "in", infile, "contains non-numeric results\n"]))
+                    sys.exit(2)
+
+            # .ix was removed from pandas; .loc does the label-based reorder.
+            df_ds = df_ds.loc[:, ff_order]
+            df_ds.to_csv(outf, sep="\t", header=False, index=False)
+
+    # A run that reached max_error has already exited above, so any errors
+    # counted here were non-fatal warnings.
+    if nb_errors > 0:
+        sys.exit(3)
+
+
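+# A minimal, self-contained sketch of the same merge-and-downsample idea in
+# plain pandas. It is NOT called by this tool; the helper name and behavior
+# are illustrative assumptions (no column selection by index, no error log).
+def _merge_downsample_sketch(paths, fraction):
+    frames = [pd.read_table(p) for p in paths]
+    # Keep only the columns present in every file, in first-file order.
+    common = set(frames[0].columns)
+    for fr in frames[1:]:
+        common &= set(fr.columns)
+    cols = [c for c in frames[0].columns if c in common]
+    # Sample the requested fraction of each file, then concatenate.
+    return pd.concat([fr[cols].sample(frac=fraction) for fr in frames],
+                     ignore_index=True)
+
+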
+if __name__ == "__main__":
+    parser = ArgumentParser(
+             prog="FCStxtmerge",
+             description="Merge text-converted FCS files into one text file, based on their headers.")
+
+    parser.add_argument(
+            '-i',
+            dest="input_files",
+            required=True,
+            action='append',
+            help="File location for the text files.")
+
+    parser.add_argument(
+            '-o',
+            dest="output_file",
+            required=True,
+            help="Name of the output file.")
+
+    parser.add_argument(
+            '-c',
+            dest="columns",
+            help="Specify which column to keep in output file")
+
+    parser.add_argument(
+            '-d',
+            dest="downsampling_factor",
+            help="How much of each file to keep")
+
+    args = parser.parse_args()
+
+    # Get columns to merge on if any:
+    default_value_col = ["i.e.:1,2,5", "default", "Default"]
+    columns = []
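+    # Columns arrive 1-based from the command line and are converted to
+    # 0-based indices below, e.g. "1,2,5" becomes [0, 1, 4].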
+    if args.columns:
+        if args.columns not in default_value_col:
+            tmp_col = args.columns.split(",")
+            if len(tmp_col) == 1:
+                if not tmp_col[0].strip():
+                    columns = []
+                elif not is_integer(tmp_col[0].strip()):
+                    sys.exit(7)
+                else:
+                    columns.append(int(tmp_col[0].strip()) - 1)
+            else:
+                for c in tmp_col:
+                    if not is_integer(c.strip()):
+                        sys.exit(6)
+                    else:
+                        columns.append(int(c.strip()) - 1)
+
+    # Get down sampling factor if any:
+    # Note: 'X' stands in for '%' because that is what Galaxy passes along.
+    default_value_ds = ["i.e.:0.1 or 10X", "default", "Default"]
+    ds_factor = 0.1
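+    # Accepted forms: a fraction such as "0.1", or a percentage written with
+    # a trailing "X" such as "10X"; both of those examples parse to 0.1.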
+    if args.downsampling_factor:
+        if args.downsampling_factor not in default_value_ds:
+            downsampling_factor = args.downsampling_factor.strip().rstrip("X")
+            if is_number(downsampling_factor):
+                ds_factor = float(downsampling_factor)
+                if 1 < ds_factor <= 100:
+                    # Values above 1 are read as percentages.
+                    ds_factor = ds_factor / 100
+                elif ds_factor > 100 or ds_factor <= 0:
+                    sys.stderr.write("Invalid downsampling factor: " + str(ds_factor) + "\n")
+                    sys.exit(8)
+            else:
+                sys.exit(8)
+
+    input_files = list(args.input_files)
+    merge_and_DS_txt(input_files, args.output_file, columns, ds_factor)
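+
+# Example invocation (hypothetical file names):
+#   python FCStxtMergeDownsample.py -i sample1.txt -i sample2.txt \
+#       -o merged.txt -c 1,2 -d 10X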