diff heinz_scoring.py @ 0:e41ec5af7472 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/heinz commit b0b2c64a46bdd9beebdfb7fc5312f75346483763
author iuc
date Thu, 02 Aug 2018 11:57:44 -0400
parents
children 5f589c91566e
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/heinz_scoring.py	Thu Aug 02 11:57:44 2018 -0400
@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+"""Calculate scores for Heinz.
+
+This script transform a p-value into a score:
+    1. Use alpha and lambda to calculate a threshold P-value.
+    2. Calculate a score based on each P-value by alpha and the threshold.
+
+For more details, please refer to the paper doi:10.1093/bioinformatics/btn161
+
+Input:
+    P-values from DESeq2 result: first column: names, second column P-values
+Output:
+    Scores, which will be used as the input of Heinz.
+    First column: names, second column: scores.
+
+Python 3 is required.
+"""
+# Implemented by: Chao (Cico) Zhang
+# Homepage: https://Hi-IT.org
+# Date: 14 Mar 2017
+# Last modified: 23 May 2018
+
+import argparse
+import sys
+
+import numpy as np
+import pandas as pd
+
+
+parser = argparse.ArgumentParser(description='Transform a P-value into a '
+                                 'score which can be used as the input of '
+                                 'Heinz')
+parser.add_argument('-n', '--node', required=True, dest='nodes',
+                    metavar='nodes_pvalue.txt', type=str,
+                    help='Input file of nodes with P-values')
+parser.add_argument('-f', '--fdr', required=True, dest='fdr',
+                    metavar='0.007', type=float, help='Choose a value of FDR')
+parser.add_argument('-m', '--model', required=False, dest='param_file',
+                    metavar='param.txt', type=str,
+                    help='A txt file contains model params as input')
+parser.add_argument('-a', '--alpha', required=False, dest='alpha',
+                    metavar='0.234', type=float, default=0.5,
+                    help='Single parameter alpha as input if txt input is '
+                    'not provided')
+parser.add_argument('-l', '--lambda', required=False, dest='lam',
+                    metavar='0.345', type=float, default=0.5,
+                    help='Single parameter lambda as input if txt input is '
+                    'not provided')
+parser.add_argument('-o', '--output', required=True, dest='output',
+                    metavar='scores.txt', type=str,
+                    help='The output file to store the calculated scores')
+args = parser.parse_args()
+
+# Check if the parameters are complete
+if args.output is None:
+    sys.exit('Output file is not designated.')
+
+if args.nodes is None:
+    sys.exit('Nodes with p-values must be provided.')
+
+if args.fdr is None:
+    sys.exit('FDR must be provided')
+
+if args.fdr >= 1 or args.fdr <= 0:
+    sys.exit('FDR must greater than 0 and smaller than 1')
+
+# run heinz-print according to the input type
+if args.param_file is not None:  # if BUM output is provided
+    with open(args.param_file) as p:
+        params = p.readlines()
+        lam = float(params[0])  # Maybe this is a bug
+        alpha = float(params[1])  # Maybe this is a bug
+# if BUM output is not provided
+elif args.alpha is not None and args.lam is not None:
+    lam = args.lam
+    alpha = args.alpha
+else:  # The input is not complete
+    sys.exit('The parameters of the model are incomplete.')
+
+# Calculate the threshold P-value
+pie = lam + (1 - lam) * alpha
+p_threshold = np.power((pie - lam * args.fdr) / (args.fdr - lam * args.fdr),
+                       1 / (alpha - 1))
+print(p_threshold)
+# Calculate the scores
+input_pvalues = pd.read_csv(args.nodes, sep='\t', names=['node', 'pvalue'])
+input_pvalues.loc[:, 'score'] = input_pvalues.pvalue.apply(lambda x:
+                                                           (alpha - 1) *
+                                                           (np.log(x) -
+                                                            np.log(
+                                                                p_threshold)))
+# print(input_pvalues.loc[:, ['node', 'score']])
+input_pvalues.loc[:, ['node', 'score']].to_csv(args.output, sep='\t',
+                                               index=False, header=False)