# HG changeset patch # User immport-devteam # Date 1488218077 18000 # Node ID 8283ff163ba6372004e7e195ceaaa730d26339ae Uploaded diff -r 000000000000 -r 8283ff163ba6 flow_overview/color_palette.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/flow_overview/color_palette.py Mon Feb 27 12:54:37 2017 -0500 @@ -0,0 +1,48 @@ +###################################################################### +# Copyright (c) 2016 Northrop Grumman. +# All rights reserved. +###################################################################### + +color_palette = [ + '#000000', # Black 0 + '#FF0000', # Red 1 + '#FFFF00', # Yellow 2 + '#008000', # Dark Green 3 + '#0000FF', # Blue 4 + '#FFA500', # Orange 5 + '#8A2BE2', # BlueViolet 6 + '#808000', # Olive 7 + '#00FFFF', # Cyan 8 + '#FF00FF', # Magenta 9 + '#00FF00', # Green 10 + '#000080', # Navy Blue 11 + '#F08080', # Light Coral 12 + '#800080', # Purple 13 + '#F0E68C', # Khaki 14 + '#8FBC8F', # Dark Sea Green 15 + '#2F4F4F', # Dark Slate Grey 16 + '#008080', # Teal 17 + '#9932CC', # Dark Orchid 18 + '#FF7F50', # Coral 19 + '#FFD700', # Gold 20 + '#008B8B', # Cyan 4 21 + '#800000', # Maroon 22 + '#5F9EA0', # Cadet Blue 23 + '#FFC0CB', # Pink 24 + '#545454', # Grey 25 + '#7FFFD4', # Aquamarine 26 + '#ADD8E6', # Light Blue 27 + '#DB7093', # Medium Violet Red 28 + '#CD853F', # Tan 3 29 + '#4169E1', # Royal Blue 30 + '#708090', # Slate Grey 31 + '#4682B4', # Steel Blue 32 + '#D8BFD8', # Thistle 33 + '#F5DEB3', # Wheat 34 + '#9ACD32', # Yellow Green 35 + '#BDB76B', # Dark Khaki 36 + '#8B008B', # Magenta 4 37 + '#556B2F', # Dark Olive Green 38 + '#00CED1', # Dark Turquoise 39 + '#FF1493' # Deep Pink +] diff -r 000000000000 -r 8283ff163ba6 flow_overview/flowstatlib.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/flow_overview/flowstatlib.py Mon Feb 27 12:54:37 2017 -0500 @@ -0,0 +1,100 @@ +###################################################################### +# Copyright (c) 2016 Northrop Grumman. +# All rights reserved. +###################################################################### +from __future__ import print_function +import sys +import pandas as pd +from scipy.stats import gmean +from argparse import ArgumentParser + + +def gen_overview_stats(file_name): + flow_stats = {} + fcs = pd.read_table(file_name) + (events, columns) = fcs.shape + flow_stats['fcs'] = fcs + flow_stats['events'] = events + flow_stats['columns'] = columns - 1 + flow_stats['data'] = fcs.iloc[:, :-1] + flow_stats['population'] = fcs.iloc[:, -1:].iloc[:, 0] + flow_stats['population_freq'] = flow_stats['population'].value_counts() + flow_stats['population_sample'] = (flow_stats['population_freq'] * (20000/float(events))).round(decimals=0) + flow_stats['population_freq_sort'] = flow_stats['population_freq'].sort_index() + flow_stats['population_per'] = (flow_stats['population'].value_counts(normalize=True) * 100).round(decimals=2) + flow_stats['population_per_sort'] = flow_stats['population_per'].sort_index() + flow_stats['population_all'] = pd.concat([flow_stats['population_freq_sort'], flow_stats['population_per_sort']], axis=1) + flow_stats['population_all'].columns = ['Count', 'Percentage'] + flow_stats['min'] = flow_stats['data'].values.min() + flow_stats['max'] = flow_stats['data'].values.max() + flow_stats['markers'] = list(flow_stats['data'].columns) + flow_stats['mfi'] = fcs.groupby('Population').mean().round(decimals=2) + flow_stats['mfi_pop'] = pd.merge(flow_stats['mfi'], flow_stats['population_all'], left_index=True, right_index=True) + flow_stats['mfi_pop']['Population'] = flow_stats['mfi_pop'].index + flow_stats['gmfi'] = fcs.groupby('Population').agg(lambda x: gmean(list(x))).round(decimals=2) + flow_stats['gmfi_pop'] = pd.merge(flow_stats['gmfi'], flow_stats['population_all'], left_index=True, right_index=True) + flow_stats['gmfi_pop']['Population'] = flow_stats['gmfi_pop'].index + flow_stats['mdfi'] = fcs.groupby('Population').median().round(decimals=2) + flow_stats['mdfi_pop'] = pd.merge(flow_stats['mdfi'], flow_stats['population_all'], left_index=True, right_index=True) + flow_stats['mdfi_pop']['Population'] = flow_stats['mdfi_pop'].index + + # + # If the number of events is less than 20000, then return + # the complete data set, + # Otherwise sample the data to only return 20000 events. + if events <= 20000: + flow_stats['sample'] = fcs + else: + fcs_np = fcs.values + sample_data = [] + pop_found = {} + for i in range(0, events): + population_number = fcs_np[i][columns-1] + if population_number in pop_found: + if pop_found[population_number] < flow_stats['population_sample'][population_number]: + pop_found[population_number] += 1 + sample_data.append(fcs_np[i]) + else: + pop_found[population_number] = 1 + sample_data.append(fcs_np[i]) + flow_stats['sample'] = pd.DataFrame(sample_data) + flow_stats['sample'].columns = fcs.columns + + flow_stats['sample_data'] = flow_stats['sample'].iloc[:, :-1] + flow_stats['sample_population'] = flow_stats['sample'].iloc[:, -1:].iloc[:, 0] + + return flow_stats + + +if __name__ == '__main__': + parser = ArgumentParser( + prog="flowstats", + description="Gets statistics on FLOCK run") + + parser.add_argument( + '-i', + dest="input_file", + required=True, + help="File locations for flow clr file.") + + parser.add_argument( + '-o', + dest="out_file", + required=True, + help="Path to the directory for the output file.") + args = parser.parse_args() + + flow_stats = gen_overview_stats(args.input_file) + with open(args.out_file, "w") as outf: + outf.write("Events: ", flow_stats['events']) + outf.write("Min: ", flow_stats['min']) + outf.write("Max: ", flow_stats['max']) + outf.write("Columns: ", flow_stats['columns']) + outf.write("Markers: ", flow_stats['markers']) + outf.write("Population: ", flow_stats['population']) + outf.write("Population Freq: ", flow_stats['population_freq']) + outf.write("Population Sample: ", flow_stats['population_sample']) + outf.write("Population Per: ", flow_stats['population_per']) + outf.write("Sample Data contains ", len(flow_stats['sample']), " events") + outf.write("MIF_POP ", flow_stats['mfi_pop']) + sys.exit(0) diff -r 000000000000 -r 8283ff163ba6 flow_overview/genFlowOverview.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/flow_overview/genFlowOverview.py Mon Feb 27 12:54:37 2017 -0500 @@ -0,0 +1,330 @@ +#!/usr/bin/env python +###################################################################### +# Copyright (c) 2016 Northrop Grumman. +# All rights reserved. +###################################################################### +from __future__ import print_function +import sys +import os +import pandas as pd +import logging +import fileinput + +from argparse import ArgumentParser +from jinja2 import Environment, FileSystemLoader +from collections import defaultdict + +from color_palette import color_palette +from flowstatlib import gen_overview_stats +import matplotlib as mpl +mpl.use('Agg') +import matplotlib.pyplot as plt + + +profile_key = { + "1": "-", + "2": "lo", + "3": "+", + "4": "hi" +} + + +# flow CL functions +def run_flowCL(phenotype, output_txt, output_pdf, tool): + run_command = " ". join(["Rscript --slave --vanilla", tool, "--args", + output_txt, phenotype]) + os.system(run_command) + + get_graph = " ".join(["mv flowCL_results/*.pdf", output_pdf]) + os.system(get_graph) + return + + +def generate_flowCL_query(list_markers, list_types): + if (len(list_markers) != len(list_types)): + return("pb with headers") + query = [] + # go through both lists, remove fsc/ssc + for i in range(1, len(list_markers)): + if not list_markers[i].startswith("FSC") and not list_markers[i].startswith("SSC"): + query.append(list_markers[i].upper()) + query.append(profile_key[list_types[i]]) + # return concatenated string + return("".join(query)) + + +def translate_profiles(input_file, tool_dir, html_dir): + tool = "/".join([tool_dir, "getOntology.R"]) + html_table = "".join([html_dir, "/CLprofiles.txt"]) + score_table = "".join(["cp ", input_file, " ", html_dir, "/scores.txt"]) + os.system(score_table) + + # read profile + with open(input_file, "r") as flock_profiles, open(html_table, "w") as out: + headers = flock_profiles.readline() + headers = headers.strip() + markers = headers.split("\t")[:-2] + counter = 0 + + out.write("Population\tFlowCL Query\tNb Results\tLink to PDF\t") + out.write("Top Result Label\tTop Result Score\tTop Result CL\n") + queries = {} + # create marker query for each population + for lines in flock_profiles: + lines = lines.strip("\n") + pop_profile = lines.split("\t")[:-2] + flowcl_query = generate_flowCL_query(markers, pop_profile) + counter += 1 + nb_results = "0" + top_label = "no_match" + top_score = "NA" + top_CL = "NA" + pdf_link = "NA" + + # check if query was run before + if flowcl_query not in queries: + # create filenames for results & graphs + txt = "".join(["flowcl_pop", str(counter).zfill(2), ".txt"]) + text_result = "/".join([html_dir, txt]) + graph = "".join(["flowcl_pop", str(counter).zfill(2), ".pdf"]) + graph_output = "/".join([html_dir, graph]) + # run flowCL for each marker profile + run_flowCL(flowcl_query, text_result, graph_output, tool) + + # test that text file exists if not results are all NAs: + if os.path.isfile(text_result): + with open(text_result, "r") as res: + for line in res: + if line.startswith("Score"): + data = line.split(") ") + top_score = data[2][:-2] + tot_results = len(data) - 2 + nb_results = str(tot_results) + if tot_results == 5: + if len(data[6].split("+")) > 1: + nb_results = "5+" + elif line.startswith("Cell ID"): + prep_link = line.split(") ")[1][:-2] + cl = prep_link.replace("_", ":") + link = "".join(['']) + top_CL = "".join([link, prep_link, ""]) + elif line.startswith("Cell Label"): + top_label = line.split(") ")[1][:-2] + pdf_link = "".join(['PDF']) + tmpflowcl_query = "".join(['', flowcl_query, '']) + queries[flowcl_query] = { + "query": tmpflowcl_query, + "results": nb_results, + "pdf": pdf_link, + "label": top_label, + "score": top_score, + "CL": top_CL + } + + # write query results to CLprofiles.txt + out.write("\t".join([pop_profile[0], + queries[flowcl_query]["query"], + queries[flowcl_query]["results"], + queries[flowcl_query]["pdf"], + queries[flowcl_query]["label"], + queries[flowcl_query]["score"], + queries[flowcl_query]["CL"]]) + "\n") + + +# boxplots data massaging +def panel_to_json_string(panel): + # from http://stackoverflow.com/questions/28078118/merge-many-json-strings-with-python-pandas-inputs + def __merge_stream(key, stream): + return '"' + key + '"' + ': ' + stream + ', ' + try: + stream = '{' + for item in panel.items: + stream += __merge_stream(item, panel.loc[item, :, :].to_json()) + # take out extra last comma + stream = stream[:-2] + # add the final paren + stream += '}' + except: + logging.exception('Panel Encoding did not work') + return stream + + +def get_outliers(group, upper, lower): + cat = group.name + out = {} + for marker in group: + out[marker] = group[(group[marker] > upper.loc[cat][marker]) | (group[marker] < lower.loc[cat][marker])][marker] + return out + + +def get_boxplot_stats(all_data, mfi_file, output_json): + # modified code from http://bokeh.pydata.org/en/latest/docs/gallery/boxplot.html + df = pd.read_table(all_data) + pops = df.groupby('Population') + q1 = pops.quantile(q=0.25) + q2 = pops.quantile(q=0.5) + q3 = pops.quantile(q=0.75) + iqr = q3 - q1 + upper = q3 + 1.5*iqr + lower = q1 - 1.5*iqr + resampled = False + # get outliers + out = pops.apply(get_outliers, upper, lower).dropna() + outliers = defaultdict(dict) + for population in set(df.Population): + for marker in df.columns: + if marker != 'Population': + tmp_outliers = list(out[population][marker]) + if (len(list(out[population][marker]))> 100): + tmp_outliers = list(out[population][marker].sample(n=100)) + resampled = True + outliers[population][marker] = tmp_outliers + outdf = pd.DataFrame(outliers) + # Get initial MFI values + mfi = pd.read_table(mfi_file) + mfi = mfi.set_index('Population') + + data = {'q1': q1, + 'q2': q2, + 'q3': q3, + 'upper': upper, + 'lower': lower, + 'outliers': outdf.T, + 'mfi': mfi} + wp = pd.Panel(data) + + with open(output_json, "w") as js_all: + js_all.write(panel_to_json_string(wp)) + + return resampled + +# html generation +def gen_flow_overview(args): + flow_stats = gen_overview_stats(args.input_file) + os.mkdir(args.output_directory) + html_template = "genOverview.template" + + if args.scores: + translate_profiles(args.scores, args.tool_directory, args.output_directory) + html_template = "genOverviewCL.template" + + env = Environment(loader=FileSystemLoader(args.tool_directory + "/templates")) + template = env.get_template(html_template) + + real_directory = args.output_directory.replace("/job_working_directory", "") + context = {'outputDirectory': real_directory} + overview = template.render(**context) + with open(args.output_file, "w") as f: + f.write(overview) + + flow_sample_file_name = args.output_directory + "/flow.sample" + with open(flow_sample_file_name, "w") as flow_sample_file: + flow_stats['sample'].to_csv(flow_sample_file, sep="\t", index=False, float_format='%.0f') + + flow_mfi_file_name = args.output_directory + "/flow.mfi" + with open(flow_mfi_file_name, "w") as flow_mfi_file: + flow_stats[args.mfi_calc].to_csv(flow_mfi_file, sep="\t", float_format='%.0f') + + mpop = "_".join([args.mfi_calc, "pop"]) + flow_mfi_pop_file_name = args.output_directory + "/flow.mfi_pop" + with open(flow_mfi_pop_file_name, "w") as flow_mfi_pop_file: + flow_stats[mpop].to_csv(flow_mfi_pop_file, sep="\t", index=False, float_format="%.2f") + + # box plot data + boxplot_data = args.output_directory + "/boxplotData.json" + resampled = get_boxplot_stats(args.input_file, flow_mfi_file_name, boxplot_data) + + # Generate the Images -- eventually we should change that over to D3 + fcm = flow_stats['sample_data'].values + colors = [] + for i, j in enumerate(flow_stats['sample_population']): + colors.append(color_palette[j]) + + for i in range(flow_stats['columns']): + for j in range(flow_stats['columns']): + file_name = "m" + str(i) + "_m" + str(j) + ax = plt.subplot(1, 1, 1) + plt.subplots_adjust(left=0.0, bottom=0.0, right=1.0, top=1.0, wspace=0.0, hspace=0.0) + plt.scatter(fcm[:, i], fcm[:, j], s=1, c=colors, edgecolors='none') + plt.axis([0, 1024, 0, 1024]) + plt.xticks([]) + plt.yticks([]) + F = plt.gcf() + F.set_size_inches(1, 1) + F.set_dpi(90) + png_file = file_name + "_90X90.png" + F.savefig(args.output_directory + "/" + png_file) + plt.clf() + + flow_overview_file_name = args.output_directory + "/flow.overview" + with open(flow_overview_file_name, "w") as flow_overview_file: + flow_overview_file.write("
| \n") + for i in range(flow_stats['columns']): + flow_overview_file.write(" | " + flow_stats['markers'][i] + "\n") + + for i in range(flow_stats['columns']): + flow_overview_file.write(" | 
| " + flow_stats['markers'][i] + "\n") + for j in range(flow_stats['columns']): + file_name = "m" + str(j) + "_m" + str(i) + image_file = file_name + "_90X90.png" + flow_overview_file.write(' |