changeset 0:b82fdb006304 draft

planemo upload for repository https://github.com/monikaheinzl/galaxyProject/tree/master/tools/fsd_regions commit 6055f8c5c052f528ff85fb5e0d43b4500830637a
author mheinzl
date Thu, 10 May 2018 07:28:39 -0400
parents
children 9ce2b4089c1b
files fsd_regions.py fsd_regions.xml
diffstat 2 files changed, 279 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fsd_regions.py	Thu May 10 07:28:39 2018 -0400
@@ -0,0 +1,205 @@
+#!/usr/bin/env python
+
+# Family size distribution of tags which were aligned to the reference genome
+#
+# Author: Monika Heinzl, Johannes-Kepler University Linz (Austria)
+# Contact: monika.heinzl@edumail.at
+#
+# Takes at least one TABULAR file with tags before the alignment to the SSCS
+# and a TXT with tags of reads that overlap the regions of the reference genome as input.
+# The program produces a plot which shows the distribution of family sizes of the tags from the input files and
+# a CSV file with the data of the plot.
+
+# USAGE: python FSD_regions_1.6_FINAL.py --inputFile filenameSSCS --inputName1 filenameSSCS --ref_genome  filenameRefGenome --sep "characterWhichSeparatesCSVFile" --output_csv outptufile_name_csv --output_pdf outptufile_name_pdf
+
+import numpy
+import matplotlib.pyplot as plt
+import argparse
+import sys
+import os
+from matplotlib.backends.backend_pdf import PdfPages
+
+def readFileReferenceFree(file, delim):
+    with open(file, 'r') as dest_f:
+        data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter=delim, comments='#', dtype='string')
+        return(data_array)
+
+def make_argparser():
+    parser = argparse.ArgumentParser(description='Family Size Distribution of tags which were aligned to regions of the reference genome')
+    parser.add_argument('--inputFile',
+                        help='Tabular File with three columns: ab or ba, tag and family size.')
+    parser.add_argument('--inputName1')
+    parser.add_argument('--ref_genome',
+                        help='TXT File with tags of reads that overlap the region.')
+    parser.add_argument('--output_csv', default="data.csv", type=str,
+                        help='Name of the pdf and csv file.')
+    parser.add_argument('--output_pdf', default="data.pdf", type=str,
+                        help='Name of the pdf and csv file.')
+    parser.add_argument('--sep', default=",",
+                        help='Separator in the csv file.')
+    return parser
+
+def compare_read_families_refGenome(argv):
+    parser = make_argparser()
+    args = parser.parse_args(argv[1:])
+
+    firstFile = args.inputFile
+    name1 = args.inputName1
+    name1 = name1.split(".tabular")[0]
+    refGenome = args.ref_genome
+    title_file = args.output_pdf
+    title_file2 = args.output_csv
+    sep = args.sep
+
+    if type(sep) is not str or len(sep) > 1:
+        print("Error: --sep must be a single character.")
+        exit(3)
+
+    with open(title_file2, "w") as output_file, PdfPages(title_file) as pdf:
+        data_array = readFileReferenceFree(firstFile, "\t")
+
+        mut_array = readFileReferenceFree(refGenome, " ")
+        length_regions = len(mut_array)
+
+        seq = numpy.array(data_array[:, 1])
+        tags = numpy.array(data_array[:, 2])
+        quant = numpy.array(data_array[:, 0]).astype(int)
+        group = numpy.array(mut_array[:, 0])
+
+        all_ab = seq[numpy.where(tags == "ab")[0]]
+        all_ba = seq[numpy.where(tags == "ba")[0]]
+        quant_ab = quant[numpy.where(tags == "ab")[0]]
+        quant_ba = quant[numpy.where(tags == "ba")[0]]
+
+        seqDic_ab = dict(zip(all_ab, quant_ab))
+        seqDic_ba = dict(zip(all_ba, quant_ba))
+
+        seq_mut = numpy.array(mut_array[:, 1])
+
+        groupUnique, group_index = numpy.unique(group, return_index=True)
+        groupUnique = groupUnique[numpy.argsort(group_index)]
+
+        lst_ab = []
+        for i in seq_mut:
+            lst_ab.append(seqDic_ab.get(i))
+
+        lst_ba = []
+        for i in seq_mut:
+            lst_ba.append(seqDic_ba.get(i))
+
+        quant_ab = numpy.array(lst_ab)
+        quant_ba = numpy.array(lst_ba)
+
+        quantAfterRegion = []
+        for i in groupUnique:
+            dataAB = quant_ab[numpy.where(group == i)[0]]
+            dataBA = quant_ba[numpy.where(group == i)[0]]
+            bigFamilies = numpy.where(dataAB > 20)[0]
+            dataAB[bigFamilies] = 22
+            bigFamilies = numpy.where(dataBA > 20)[0]
+            dataBA[bigFamilies] = 22
+
+            quantAll = numpy.concatenate((dataAB, dataBA))
+            quantAfterRegion.append(quantAll)
+
+        maximumX = numpy.amax(numpy.concatenate(quantAfterRegion))
+        minimumX = numpy.amin(numpy.concatenate(quantAfterRegion))
+
+        ### PLOT ###
+        plt.rc('figure', figsize=(11.69, 8.27))  # A4 format
+        plt.rcParams['axes.facecolor'] = "E0E0E0"  # grey background color
+        plt.rcParams['xtick.labelsize'] = 12
+        plt.rcParams['ytick.labelsize'] = 12
+        plt.rcParams['patch.edgecolor'] = "black"
+        fig = plt.figure()
+        plt.subplots_adjust(bottom=0.3)
+
+        colors = ["#6E6E6E", "#0431B4", "#5FB404", "#B40431", "#F4FA58", "#DF7401", "#81DAF5"]
+
+        col = []
+        for i in range(0, len(groupUnique)):
+            col.append(colors[i])
+
+        counts = plt.hist(quantAfterRegion, bins=range(minimumX, maximumX + 1), stacked=False, label=groupUnique,
+                          align="left", alpha=1, color=col, edgecolor="black", linewidth=1)
+        ticks = numpy.arange(minimumX - 1, maximumX, 1)
+
+        ticks1 = map(str, ticks)
+        ticks1[len(ticks1) - 1] = ">20"
+        plt.xticks(numpy.array(ticks), ticks1)
+        count = numpy.bincount(map(int, quant_ab))  # original counts
+
+        legend = "max. family size =\nabsolute frequency=\nrelative frequency=\n\ntotal nr. of reads="
+        plt.text(0.15, 0.105, legend, size=11, transform=plt.gcf().transFigure)
+
+        legend = "AB\n{}\n{}\n{:.5f}\n\n{:,}" \
+            .format(max(map(int, quant_ab)), count[len(count) - 1], float(count[len(count) - 1]) / sum(count),
+                    sum(numpy.array(data_array[:, 0]).astype(int)))
+        plt.text(0.35, 0.105, legend, size=11, transform=plt.gcf().transFigure)
+
+        count2 = numpy.bincount(map(int, quant_ba))  # original counts
+
+        legend = "BA\n{}\n{}\n{:.5f}" \
+            .format(max(map(int, quant_ba)), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2))
+        plt.text(0.45, 0.15, legend, size=11, transform=plt.gcf().transFigure)
+
+        legend1 = "total nr. of tags="
+        legend2 = "total numbers * \n{:,}".format(length_regions)
+        plt.text(0.6, 0.2, legend1, size=11, transform=plt.gcf().transFigure)
+        plt.text(0.75, 0.2, legend2, size=11, transform=plt.gcf().transFigure)
+        legend4 = "* In the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n"
+        plt.text(0.1, 0.02, legend4, size=11, transform=plt.gcf().transFigure)
+
+        space = numpy.arange(0, len(groupUnique), 0.02)
+        for i, s, count in zip(groupUnique, space, quantAfterRegion):
+            plt.text(0.6, 0.05 + s, "{}=\n".format(i), size=11, transform=plt.gcf().transFigure)
+            plt.text(0.75, 0.05 + s, "{:,}\n".format(len(count) / 2), size=11, transform=plt.gcf().transFigure)
+
+        plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)
+        plt.title(name1, fontsize=14)
+        plt.xlabel("No. of Family Members", fontsize=12)
+        plt.ylabel("Absolute Frequency", fontsize=12)
+        plt.grid(b=True, which="major", color="#424242", linestyle=":")
+        plt.margins(0.01, None)
+
+        # plt.savefig("{}_regions.pdf".format(title_file), bbox_inch="tight")
+        pdf.savefig(fig, bbox_inch="tight")
+        plt.close()
+
+        output_file.write("Dataset:{}{}\n".format(sep, firstFile))
+        output_file.write("{}AB{}BA\n".format(sep, sep))
+        output_file.write("max. family size:{}{}{}{}\n".format(sep, max(map(int, quant_ab)), sep, max(map(int, quant_ba))))
+        output_file.write("absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1]))
+        output_file.write("relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2)))
+        output_file.write("total nr. of reads{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int))))
+        output_file.write("\n\nValues from family size distribution\n")
+        output_file.write("{}".format(sep))
+        for i in groupUnique:
+            output_file.write("{}{}".format(i,sep))
+        output_file.write("\n")
+        j=0
+        for fs in counts[1][0:len(counts[1])-1]:
+            if fs == 21:
+                fs = ">20"
+            else:
+                fs = "={}".format(fs)
+            output_file.write("FS{}{}".format(fs,sep))
+            for n in range(len(groupUnique)):
+                output_file.write("{}{}".format(int(counts[0][n][j]), sep))
+            output_file.write("\n")
+            j+=1
+        output_file.write("sum{}".format(sep))
+        for i in counts[0]:
+            output_file.write("{}{}".format(int(sum(i)), sep))
+        output_file.write("\n")
+        output_file.write("\n\nIn the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n")
+        output_file.write("Region{}total nr. of tags per region\n".format(sep))
+        for i, count in zip(groupUnique, quantAfterRegion):
+            output_file.write("{}{}{}\n".format(i,sep,len(count) / 2))
+        output_file.write("sum of tags{}{}\n".format(sep,length_regions))
+
+    print("Files successfully created!")
+    #print("Files saved under {}.pdf and {}.csv in {}!".format(title_file, title_file, os.getcwd()))
+
+if __name__ == '__main__':
+  sys.exit(compare_read_families_refGenome(sys.argv))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fsd_regions.xml	Thu May 10 07:28:39 2018 -0400
@@ -0,0 +1,74 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<tool id="fsd_regions" name="Duplex Sequencing Analysis:" version="0.0.1">
+    <requirements>
+        <requirement type="package" version="2.7">python</requirement>
+        <requirement type="package" version="1.4">matplotlib</requirement>
+    </requirements>
+    <description>Family size distribution (FSD) of aligned tags to reference genome</description>
+    <command>
+        python2 $__tool_directory__/fsd_regions.py --inputFile "$file1" --inputName1 "$file1.name" --ref_genome "$file2" --sep $separator --output_csv $output_csv --output_pdf $output_pdf
+    </command>
+    <inputs>
+        <param name="file1" type="data" format="tabular" label="Dataset 1: input tags of whole dataset" optional="false" help="Input in tabular format with the family size, tags and the direction of the strand ('ab' or 'ba') for each family."/>
+        <param name="file2" type="data" format="txt" label="Dataset 2: input tags aligned to the reference genome" help="Input in txt format with the regions and the tags, which were aligned to the reference genome."/>
+        <param name="separator" type="text" label="Separator of the CSV file." help="can be a single character" value=","/>
+    </inputs>
+    <outputs>
+        <data name="output_pdf" format="pdf" />
+        <data name="output_csv" format="csv"/>
+    </outputs>
+    <help> <![CDATA[
+
+**What it does**
+        
+    This tool will create a distribution of family sizes of all tags, which were aligned to the reference genome. The distribution is separated after the regions of the reference genome.
+               
+        
+**Input**
+        
+    This tools expects a tabular file with the tags of all families, their sizes and information about forward (ab) and reverse (ba) strands. 
+    
+    +-----+----------------------------+----+
+    | 1   | AAAAAAAAAAAATGTTGGAATCTT   | ba |
+    +-----+----------------------------+----+
+    | 10  | AAAAAAAAAAAGGCGGTCCACCCC   | ab |
+    +-----+----------------------------+----+
+    | 28  | AAAAAAAAAAATGGTATGGACCGA   | ab |
+    +-----+----------------------------+----+
+    
+    In addition, a TXT file with the regions and all tags that were aligned to the reference genome is required. This file can obtained from a different tool.
+    
+    +-----------+------------------------------+
+    | 87_636    | AAATCAAAGTATGAATGAAGTTGCCT   |
+    +-----------+------------------------------+
+    | 87_636    | AAATTCATAGCATTAATTTCAACGGG   |
+    +-----------+------------------------------+
+    | 656_1143  | GGGGCAGCCATATTGGCAATTATCAT   |
+    +-----------+------------------------------+
+    
+**Output**
+        
+    The output is a PDF file with the plot and a CSV with the data of the plot.
+        
+        
+**About Author**
+        
+    Author: Monika Heinzl
+    
+    Department: Institute of Bioinformatics, Johannes Kepler University Linz, Austria
+    
+    Contact: monika.heinzl@edumail.at
+        
+        ]]> 
+
+    </help>
+    <citations>
+        <citation type="bibtex">
+            @misc{duplex,
+            author = {Heinzl, Monika},
+            year = {2018},
+            title = {Development of algorithms for the analysis of duplex sequencing data}
+         }
+        </citation>
+    </citations>
+</tool>