view damid_to_bedgraph.py @ 1:210b7f4c9acb draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_bedgraph commit 98722d2ca8205595f032361072aaab450e5f4f83
author mvdbeek
date Thu, 03 Jan 2019 09:33:24 -0500
parents 755cbe6825b5
children
line wrap: on
line source

from collections import OrderedDict

import click
import numpy as np
import pandas as pd
import traces


def order_index(df):
    """
    Split chr_start_stop in df index and order by chrom and start.
    """
    idx = df.index.str.split('_')
    idx = pd.DataFrame.from_records(list(idx))

    idx.columns = ['chr', 'start', 'stop']
    idx = idx.astype(dtype={"chr": "object",
                            "start": "int32",
                            "stop": "int32"})
    coordinates = idx.sort_values(['chr', 'start'])
    df.index = np.arange(len(df.index))
    df = df.loc[coordinates.index]
    df = coordinates.join(df)
    # index is center of GATC site
    df.index = df['start'] + 2
    return df


def interpolate_values(df, sampling_width=100):
    result = []
    for chrom in df['chr'].unique():
        chrom_df = df[df['chr'] == chrom]
        time_series = traces.TimeSeries(chrom_df['log2FC'])
        s = pd.DataFrame.from_records(time_series.sample(sampling_width, interpolate='linear'))
        # Calculate new start and end of interpolated region
        start = s[0] - int(sampling_width / 2)
        start.loc[start < 0] = 1
        end = s[0] + int(sampling_width / 2)
        result.append(pd.DataFrame(OrderedDict([('chr', chrom), ('start', start), ('end', end), ('score', s[1])])))
    return pd.concat(result)


@click.command()
@click.argument('input_path', type=click.Path(exists=True), required=True)
@click.argument('output_path', type=click.Path(exists=False), required=True)
@click.option('--resolution', help="Interpolate log2 fold change at this resolution (in basepairs)", default=50)
def deseq2_to_bedgraph(input_path, output_path, resolution=50):
    """Convert deseq2 output on GATC fragments to bedgraph file with interpolated values."""
    df = pd.read_csv(input_path, sep='\t', header=None, index_col=0, usecols=[0, 2], names=['GATC', 'log2FC'])
    df = df[~df.index.str.contains('\.')]
    df = order_index(df)
    r = interpolate_values(df, sampling_width=resolution)
    r.to_csv(output_path, sep='\t', header=None, index=None)


if __name__ == '__main__':
    deseq2_to_bedgraph()