damidseq_consecutive_peaks: consecutive_peaks.py @ 0:7f827a8e4ec5 (draft)
planemo upload for repository https://github.com/bardin-lab/damid_galaxy_tools commit c753dd4f3e1863aae7ba45dcc7efdf6937b03542-dirty
author:   mvdbeek
date:     Fri, 26 Oct 2018 11:58:06 -0400
parents:  (none)
children: f3ca59e53b73
import click
import numpy as np
import pandas as pd

SHIFTED_PADJ_COLUMN = 'shifted'
CONSECUTIVE_MAX = 'consecutive_max'
PEAKS_PER_GROUP = 'peaks_per_group'


@click.command()
@click.argument('input_file', type=click.Path(exists=True))
@click.argument('output_file', type=click.Path())
@click.argument('padj_column', default=8)
@click.argument('groupby_column', default=9)
@click.argument('add_number_of_peaks', default=True)
def determine_consecutive_peaks(input_file, output_file, padj_column, groupby_column, add_number_of_peaks):
    """Find the two lowest consecutive peaks for each group and report them."""
    df = pd.read_csv(input_file, sep='\t', header=None)
    grouped = df.groupby(groupby_column, sort=False)
    if add_number_of_peaks:
        # Record how many peaks each group contains.
        df[PEAKS_PER_GROUP] = grouped[groupby_column].transform(np.size)
    # Pair every peak with its predecessor in the same group ...
    df[SHIFTED_PADJ_COLUMN] = grouped[padj_column].shift()
    # ... and keep the worse (larger) adjusted p-value of the pair.
    df[CONSECUTIVE_MAX] = df[[padj_column, SHIFTED_PADJ_COLUMN]].max(axis=1)
    grouped = df.groupby(groupby_column, sort=False)
    # Group-wise minimum of the pairwise maxima, broadcast to every row.
    group_min = grouped[CONSECUTIVE_MAX].transform('min')
    new_df = df[df[CONSECUTIVE_MAX] == group_min].copy()
    new_df = new_df.sort_values(by=CONSECUTIVE_MAX)
    # Report the pairwise maximum as the peak's adjusted p-value.
    new_df[padj_column] = new_df[CONSECUTIVE_MAX]
    new_df = new_df.drop(labels=[CONSECUTIVE_MAX, SHIFTED_PADJ_COLUMN], axis=1)
    new_df.to_csv(output_file, sep='\t', header=False, na_rep="NaN")


if __name__ == '__main__':
    determine_consecutive_peaks()
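
The heart of the script is the groupby/shift/max pattern: within each group, every peak is paired with the peak directly before it, the worse (larger) adjusted p-value of the pair is kept, and only the pair whose worse value is the smallest in the group survives the filter. Below is a minimal sketch of that pattern on a toy frame; the column labels (8 for adjusted p-values, 9 for the group) and all values are purely illustrative, not taken from real DamID data.

import pandas as pd

# Toy peak table: column 8 holds adjusted p-values, column 9 the group label.
df = pd.DataFrame({8: [0.20, 0.01, 0.05, 0.30, 0.02, 0.04],
                   9: ['geneA', 'geneA', 'geneA', 'geneB', 'geneB', 'geneB']})

grouped = df.groupby(9, sort=False)
# Adjusted p-value of the previous peak in the same group (NaN for the first peak).
df['shifted'] = grouped[8].shift()
# Worse p-value of each consecutive pair.
df['consecutive_max'] = df[[8, 'shifted']].max(axis=1)
# Smallest pairwise maximum per group, broadcast back to every row.
best = df.groupby(9, sort=False)['consecutive_max'].transform('min')
# One row per group: the second peak of its best consecutive pair.
print(df[df['consecutive_max'] == best])

For geneA this selects the peak with p = 0.05 (paired with 0.01), and for geneB the peak with p = 0.04 (paired with 0.02); the script then writes consecutive_max into the p-value column, drops the helper columns, and emits the result as a tab-separated file.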