# HG changeset patch # User mvdbeek # Date 1546883935 18000 # Node ID 3fd7995da4fd4a08c416cbc59d00f0ce12a3e879 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_peaks commit f37f4b741fd81f663d10523e1636039578c5bb55 diff -r 000000000000 -r 3fd7995da4fd damid_deseq2_to_peaks.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/damid_deseq2_to_peaks.py Mon Jan 07 12:58:55 2019 -0500 @@ -0,0 +1,62 @@ +import click +import pandas as pd +import numpy as np + + +def order_index(df): + """ + Split chr_start_stop in df index and order by chrom and start. + """ + idx = df.index.str.split('_') + idx = pd.DataFrame.from_records(list(idx)) + + idx.columns = ['chr', 'start', 'stop'] + idx = idx.astype(dtype={"chr": "object", + "start": "int32", + "stop": "int32"}) + coordinates = idx.sort_values(['chr', 'start']) + df.index = np.arange(len(df.index)) + df = df.loc[coordinates.index] + df = coordinates.join(df) + # index is center of GATC site + df.index = df['start'] + 2 + return df + + +def significant_gatcs_to_peaks(df, p_value_cutoff): + # Add `pass` column for sig. GATCs + df['pass'] = 0 + df.loc[df[6] < p_value_cutoff, 'pass'] = 1 + # Create pass_id column for consecutive pass or no-pass GATCs + # True whenever there is a value change (from previous value): + df['pass_id'] = df.groupby('chr')['pass'].diff().ne(0).cumsum() + gb = df.groupby('pass_id') + # aggregate + consecutive_gatcs = gb.aggregate({'chr': np.min, 'start': np.min, 'stop': np.max, 'pass': np.max}) + # keep only groups with 2 or more GATCS + s = gb.size() > 1 + consecutive_only = consecutive_gatcs[s] + # drop GATC groups that are not significant + peaks = consecutive_only[consecutive_only['pass'] == 1][['chr', 'start', 'stop']] + # calculate region that is not covered. + no_peaks = consecutive_only[consecutive_only['pass'] == 0][['chr', 'start', 'stop']] + s = no_peaks['stop'] - no_peaks['start'] + print("%s nt not covered by peaks" % s.sum()) + s = peaks['stop'] - peaks['start'] + print("%s nt covered by peaks" % s.sum()) + return peaks + + +@click.command() +@click.argument('input_path', type=click.Path(exists=True)) +@click.argument('output_path', type=click.Path()) +@click.option('--p_value_cutoff', type=float, default=0.01, help="Minimum adjusted p-value for a significant GATC site") +def deseq2_gatc_to_peak(input_path, output_path, p_value_cutoff): + df = pd.read_csv(input_path, sep='\t', header=None, index_col=0) + df = order_index(df) + peaks = significant_gatcs_to_peaks(df, p_value_cutoff) + peaks.to_csv(output_path, sep='\t', header=None, index=None) + + +if __name__ == '__main__': + deseq2_gatc_to_peak() diff -r 000000000000 -r 3fd7995da4fd damid_deseq2_to_peaks.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/damid_deseq2_to_peaks.xml Mon Jan 07 12:58:55 2019 -0500 @@ -0,0 +1,25 @@ + + + click + pandas + + + + + + + + + + + + + + + + + diff -r 000000000000 -r 3fd7995da4fd test-data/peaks.bed --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/peaks.bed Mon Jan 07 12:58:55 2019 -0500 @@ -0,0 +1,2 @@ +4 1 251 +4 301 501 diff -r 000000000000 -r 3fd7995da4fd test-data/test_data.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_data.tab Mon Jan 07 12:58:55 2019 -0500 @@ -0,0 +1,7 @@ +4_1_101 179.471138895981 6.11378960505906 0.621838986245469 9.83178883970079 8.2146464523617e-23 5.91609955253763e-19 +4_101_201 294.81296137271 5.60565739977369 0.577313080365325 9.70990886994353 2.73581605967775e-22 1.38780673739166e-18 +4_201_251 169.22778018355 5.7093345566596 0.605350118969888 9.43145855224256 4.0442550874846e-21 1.15555073230549e-17 +4_251_276 169.22778018355 5.7093345566596 0.605350118969888 9.43145855224256 4.0442550874846e-21 1.15555073230549e-1 +4_276_301 169.22778018355 5.7093345566596 0.605350118969888 9.43145855224256 4.0442550874846e-21 1.15555073230549e-1 +4_301_401 412.221939093451 4.95034623243659 0.53002737065317 9.33979357770923 9.65221128246454e-21 2.46758781436206e-17 +4_401_501 165.881402812843 5.71524958810642 0.620499842499124 9.21071883771587 3.23948149277376e-20 6.66609779385658e-17