Mercurial > repos > mvdbeek > damidseq_consecutive_peaks
view consecutive_peaks.py @ 1:f3ca59e53b73 draft default tip
planemo upload for repository https://github.com/bardin-lab/damid_galaxy_tools commit c753dd4f3e1863aae7ba45dcc7efdf6937b03542-dirty
author | mvdbeek |
---|---|
date | Mon, 29 Oct 2018 06:49:17 -0400 |
parents | 7f827a8e4ec5 |
children |
line wrap: on
line source
import click
import numpy as np
import pandas as pd

# Names of the helper columns added to the dataframe while processing.
SHIFTED_PADJ_COLUMN = 'shifted'
CONSECUTIVE_MAX = 'consecutive_max'
PEAKS_PER_GROUP = 'peaks_per_group'


def _select_consecutive_peaks(df, padj_column, groupby_column, add_number_of_peaks):
    """Return one row per group: the row closing the best consecutive pair.

    For every row the larger of its own padj value and the padj value of the
    previous row in the same group is computed (the "consecutive max").
    Within each group the row with the smallest consecutive max is kept.

    :param df: dataframe to process; it is copied, the caller's frame is
        left untouched.
    :param padj_column: label of the column holding the padj values.
    :param groupby_column: label of the column that defines the groups.
    :param add_number_of_peaks: if true, add a ``peaks_per_group`` column
        holding the number of rows in the row's group.
    :return: new dataframe with one row per group, sorted ascending by the
        consecutive max, with ``padj_column`` set to that consecutive max.
    """
    df = df.copy()
    grouped = df.groupby(groupby_column, sort=False)
    if add_number_of_peaks:
        df[PEAKS_PER_GROUP] = grouped[groupby_column].transform(np.size)
    # Previous row's padj within the same group; NaN for a group's first row.
    df[SHIFTED_PADJ_COLUMN] = grouped[padj_column].shift()
    # DataFrame.max skips NaN by default, so the first row of every group
    # (which has no predecessor) gets its own padj value here.
    df[CONSECUTIVE_MAX] = df[[padj_column, SHIFTED_PADJ_COLUMN]].max(axis=1)
    # Index label of the group-wise minimum of the consecutive max.
    idx = df.groupby(groupby_column, sort=False)[CONSECUTIVE_MAX].idxmin()
    # sort_values returns a new frame, so the result has to be kept.
    selected = df.loc[idx].sort_values(by=CONSECUTIVE_MAX)
    # Report the consecutive max in place of the single-row padj value.
    selected[padj_column] = selected[CONSECUTIVE_MAX]
    return selected.drop(columns=[CONSECUTIVE_MAX, SHIFTED_PADJ_COLUMN])


@click.command()
@click.argument('input_file', type=click.Path(exists=True))
@click.argument('output_file', type=click.Path())
@click.argument('padj_column', default=8)
@click.argument('groupby_column', default=9)
@click.argument('add_number_of_peaks', default=True)
def determine_consecutive_peaks(input_file, output_file, padj_column, groupby_column, add_number_of_peaks):
    """Find the two lowest consecutive peaks for each group and report them.

    INPUT_FILE is read as a headerless tab-separated table. For every group
    in GROUPBY_COLUMN the row whose PADJ_COLUMN value, together with that of
    the preceding row of the same group, has the lowest maximum is written to
    OUTPUT_FILE (tab-separated, no header, missing values as "NaN"). The
    PADJ_COLUMN of each reported row holds that maximum. If
    ADD_NUMBER_OF_PEAKS is true, the number of rows per group is appended as
    an extra column.
    """
    df = pd.read_csv(input_file, sep='\t', header=None)
    new_df = _select_consecutive_peaks(df, padj_column, groupby_column, add_number_of_peaks)
    # The row index is written as the first output column (pandas default).
    new_df.to_csv(output_file, sep='\t', header=False, na_rep="NaN")


if __name__ == '__main__':
    determine_consecutive_peaks()