Galaxy |

Changeset 4:b202c97deabe (2018-10-08)

Previous changeset 3:85d870b8ae92 (2018-05-23) Next changeset 5:52454637bc45 (2018-10-17)

Commit message:
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd_regions commit dfaab79252a858e8df16bbea3607ebf1b6962e5a

modified:
fsd_regions.py
fsd_regions.xml

added:
test-data/Test_data.tabular
test-data/Test_data_regions.txt
test-data/output_file.pdf
test-data/output_file.tabular

diff -r 85d870b8ae92 -r b202c97deabe fsd_regions.py
--- a/fsd_regions.py Wed May 23 15:06:27 2018 -0400
+++ b/fsd_regions.py Mon Oct 08 05:53:50 2018 -0400

[

@@ -8,37 +8,36 @@
# Takes at least one TABULAR file with tags before the alignment to the SSCS
# and a TXT with tags of reads that overlap the regions of the reference genome as input.
# The program produces a plot which shows the distribution of family sizes of the tags from the input files and
-# a CSV file with the data of the plot.
+# a tabular file with the data of the plot.

-# USAGE: python FSD_regions_1.6_FINAL.py --inputFile filenameSSCS --inputName1 filenameSSCS --ref_genome  filenameRefGenome --sep "characterWhichSeparatesCSVFile" --output_csv outptufile_name_csv --output_pdf outptufile_name_pdf
+# USAGE: python FSD_regions_1.6_FINAL.py --inputFile filenameSSCS --inputName1 filenameSSCS --ref_genome filenameRefGenome --output_tabular outputfile_name_tabular --output_pdf outputfile_name_pdf

-import numpy
-import matplotlib.pyplot as plt
import argparse
import sys
-import os
+
+import matplotlib.pyplot as plt
+import numpy
from matplotlib.backends.backend_pdf import PdfPages

+plt.switch_backend('agg')
+
+
def readFileReferenceFree(file, delim):
     """Read a delimited text file into a NumPy array of strings.

     No header line is skipped, text following a '#' is treated as a
     comment, and every field is kept as a string (no numeric conversion).

     Parameters:
         file:  path of the text file to read.
         delim: column delimiter handed to numpy.genfromtxt.

     Returns:
         The array produced by numpy.genfromtxt for the file's contents.
     """
     with open(file, 'r') as dest_f:
         # Use the builtin `str` rather than the 'string' alias: both select
         # the same dtype on Python 2, but NumPy rejects the 'string' alias
         # on Python 3 ("data type 'string' not understood").
         data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter=delim, comments='#', dtype=str)
         return data_array

+
def make_argparser():
     parser = argparse.ArgumentParser(description='Family Size Distribution of tags which were aligned to regions of the reference genome')
-    parser.add_argument('--inputFile',
-                        help='Tabular File with three columns: ab or ba, tag and family size.')
+    parser.add_argument('--inputFile', help='Tabular File with three columns: ab or ba, tag and family size.')
     parser.add_argument('--inputName1')
-    parser.add_argument('--ref_genome',
-                        help='TXT File with tags of reads that overlap the region.')
-    parser.add_argument('--output_pdf', default="data.pdf", type=str,
-                       help='Name of the pdf and csv file.')
-    parser.add_argument('--output_csv', default="data.csv", type=str,
-                        help='Name of the pdf and csv file.')
-    parser.add_argument('--sep', default=",",
-                        help='Separator in the csv file.')
+    parser.add_argument('--ref_genome', help='TXT File with tags of reads that overlap the region.')
+    parser.add_argument('--output_pdf', default="data.pdf", type=str, help='Name of the pdf and tabular file.')
+    parser.add_argument('--output_tabular', default="data.tabular", type=str, help='Name of the pdf and tabular file.')
     return parser

+
def compare_read_families_refGenome(argv):
     parser = make_argparser()
     args = parser.parse_args(argv[1:])
@@ -48,12 +47,8 @@
     name1 = name1.split(".tabular")[0]
     refGenome = args.ref_genome
     title_file = args.output_pdf
-    title_file2 = args.output_csv
-    sep = args.sep
-
-    if type(sep) is not str or len(sep) > 1:
-        print("Error: --sep must be a single character.")
-        exit(3)
+    title_file2 = args.output_tabular
+    sep = "\t"

     with open(title_file2, "w") as output_file, PdfPages(title_file) as pdf:
         data_array = readFileReferenceFree(firstFile, "\t")
@@ -105,7 +100,7 @@
         maximumX = numpy.amax(numpy.concatenate(quantAfterRegion))
         minimumX = numpy.amin(numpy.concatenate(quantAfterRegion))

-        ### PLOT ###
+        # PLOT
         plt.rc('figure', figsize=(11.69, 8.27))  # A4 format
         plt.rcParams['axes.facecolor'] = "E0E0E0"  # grey background color
         plt.rcParams['xtick.labelsize'] = 14
@@ -156,7 +151,7 @@
             plt.text(0.75, 0.05 + s, "{:,}\n".format(len(count) / 2), size=11, transform=plt.gcf().transFigure)

         plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)
-        #plt.title(name1, fontsize=14)
+        # plt.title(name1, fontsize=14)
         plt.xlabel("Family size", fontsize=14)
         plt.ylabel("Absolute Frequency", fontsize=14)
         plt.grid(b=True, which="major", color="#424242", linestyle=":")
@@ -175,19 +170,19 @@
         output_file.write("\n\nValues from family size distribution\n")
         output_file.write("{}".format(sep))
         for i in groupUnique:
-            output_file.write("{}{}".format(i,sep))
+            output_file.write("{}{}".format(i, sep))
         output_file.write("\n")
-        j=0
-        for fs in counts[1][0:len(counts[1])-1]:
+        j = 0
+        for fs in counts[1][0:len(counts[1]) - 1]:
             if fs == 21:
                 fs = ">20"
             else:
                 fs = "={}".format(fs)
-            output_file.write("FS{}{}".format(fs,sep))
+            output_file.write("FS{}{}".format(fs, sep))
             for n in range(len(groupUnique)):
                 output_file.write("{}{}".format(int(counts[0][n][j]), sep))
             output_file.write("\n")
-            j+=1
+            j += 1
         output_file.write("sum{}".format(sep))
         for i in counts[0]:
             output_file.write("{}{}".format(int(sum(i)), sep))
@@ -195,11 +190,12 @@
         output_file.write("\n\nIn the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n")
         output_file.write("Region{}total nr. of tags per region\n".format(sep))
         for i, count in zip(groupUnique, quantAfterRegion):
-            output_file.write("{}{}{}\n".format(i,sep,len(count) / 2))
-        output_file.write("sum of tags{}{}\n".format(sep,length_regions))
+            output_file.write("{}{}{}\n".format(i, sep, len(count) / 2))
+        output_file.write("sum of tags{}{}\n".format(sep, length_regions))

     print("Files successfully created!")
-    #print("Files saved under {}.pdf and {}.csv in {}!".format(title_file, title_file, os.getcwd()))
+    # print("Files saved under {}.pdf and {}.csv in {}!".format(title_file, title_file, os.getcwd()))
+

if __name__ == '__main__':
-  sys.exit(compare_read_families_refGenome(sys.argv))
+    sys.exit(compare_read_families_refGenome(sys.argv))

diff -r 85d870b8ae92 -r b202c97deabe fsd_regions.xml
--- a/fsd_regions.xml Wed May 23 15:06:27 2018 -0400
+++ b/fsd_regions.xml Mon Oct 08 05:53:50 2018 -0400

[

@@ -1,22 +1,29 @@
<?xml version="1.0" encoding="UTF-8"?>
-<tool id="fsd_regions" name="Duplex Sequencing Analysis: fsd_regions" version="0.0.4">
+<tool id="fsd_regions" name="Duplex Sequencing Analysis: fsd_regions" version="1.0.0">
+    <description>Family size distribution (FSD) of user-specified regions in the reference genome</description>
     <requirements>
         <requirement type="package" version="2.7">python</requirement>
-        <requirement type="package" version="1.4">matplotlib</requirement>
+        <requirement type="package" version="1.4.0">matplotlib</requirement>
     </requirements>
-    <description>Family size distribution (FSD) of user-specified regions</description>
     <command>
-        python2 $__tool_directory__/fsd_regions.py --inputFile "$file1" --inputName1 "$file1.name" --ref_genome "$file2" --sep $separator --output_pdf $output_pdf --output_csv $output_csv
+        python2 '$__tool_directory__/fsd_regions.py' --inputFile '$file1' --inputName1 '$file1.name' --ref_genome '$file2' --output_pdf $output_pdf --output_tabular $output_tabular
     </command>
     <inputs>
         <param name="file1" type="data" format="tabular" label="Dataset 1: input tags of whole dataset" optional="false" help="Input in tabular format with the family size, tags and the direction of the strand ('ab' or 'ba') for each family."/>
-        <param name="file2" type="data" format="txt" label="Dataset 2: input tags aligned to the reference genome" help="Input in txt format with the regions and the tags, which were aligned to the reference genome."/>
-        <param name="separator" type="text" label="Separator of the CSV file." help="can be a single character" value=","/>
+        <param name="file2" type="data" format="txt" label="Dataset 2: input tags aligned to the reference genome" help="Input in txt format with the regions in the reference genome and the tags, which were aligned to the reference genome."/>
     </inputs>
     <outputs>
         <data name="output_pdf" format="pdf" />
-        <data name="output_csv" format="csv"/>
+        <data name="output_tabular" format="tabular"/>
     </outputs>
+    <tests>
+        <test>
+            <param name="file1" value="Test_data.tabular"/>
+            <param name="file2" value="Test_data_regions.txt"/>
+            <output name="output_pdf" file="output_file.pdf" lines_diff="136"/>
+            <output name="output_tabular" file="output_file.tabular"/>
+        </test>
+    </tests>
     <help> <![CDATA[

**What it does**
@@ -49,7 +56,7 @@

**Output**

-    The output is a PDF file with the plot and a CSV with the data of the plot.
+    The output is a PDF file with the plot and a tabular file with the data of the plot.


**About Author**

diff -r 85d870b8ae92 -r b202c97deabe test-data/Test_data.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Test_data.tabular Mon Oct 08 05:53:50 2018 -0400

@@ -0,0 +1,32 @@
+10 AAAAAACATCCCAATAAGAAATCA ab
+9 AAAAAACATCCCAATAAGAAATCA ba
+4 AAAAAAGTCCTTCGACTCAAGCGG ab
+5 AAAAAAGTCCTTCGACTCAAGCGG ba
+5 AAAAAATAGTTAAGCCGACACACT ab
+7 AAAAAATAGTTAAGCCGACACACT ba
+7 AAAAAATGTGCCGAACCTTGGCGA ab
+10 AAAAAATGTGCCGAACCTTGGCGA ba
+7 AAAAACAACATAGCTTGAAAATTT ab
+4 AAAAACAACATAGCTTGAAAATTT ba
+81 ATTCGGATAATTCGACGCAACATT ab
+11 ATTCGGATAATTCGACGCAACATT ba
+41 ATTCGTCGACAATACAAAGGGGCC ab
+226 ATTCGTCGACAATACAAAGGGGCC ba
+6 ATTGCCAGTGTGGGCTGGTTAGTA ab
+41 ATTGCCAGTGTGGGCTGGTTAGTA ba
+50 ATTTCGCGACCATCCGCCACTTTG ab
+332 ATTTCGCGACCATCCGCCACTTTG ba
+64 CAAACTTTAGCACAGTGTGTGTCC ab
+57 CAAACTTTAGCACAGTGTGTGTCC ba
+85 ATAAACGGCCTTCGACATTGTGAC ab
+15 ATAAACGGCCTTCGACATTGTGAC ba
+11 ATAAAGTCACCTGTGAATACGTTG ab
+35 ATAAAGTCACCTGTGAATACGTTG ba
+83 ATAAATCGAAACCGTGCCCAACAA ab
+63 ATAAATCGAAACCGTGCCCAACAA ba
+9 ATTTAGATATTTTCTTCTTTTTCT ab
+7 ATTTAGATATTTTCTTCTTTTTCT ba
+7 ATTTAGTTATCCGTCGGCGACGAA ab
+3 ATTTAGTTATCCGTCGGCGACGAA ba
+8 ATTTAGTTTGAATTGCCCTGCGTC ab
+9 ATTTAGTTTGAATTGCCCTGCGTC ba
\ No newline at end of file

diff -r 85d870b8ae92 -r b202c97deabe test-data/Test_data_regions.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Test_data_regions.txt Mon Oct 08 05:53:50 2018 -0400

@@ -0,0 +1,17 @@
+87_636 AAAAAACATCCCAATAAGAAATCA
+87_636 AAAAAAGTCCTTCGACTCAAGCGG
+87_636 AAAAAATAGTTAAGCCGACACACT
+87_636 AAAAAATGTGCCGAACCTTGGCGA
+87_636 AAAAACAACATAGCTTGAAAATTT
+656_1143 ATTCGGATAATTCGACGCAACATT
+656_1143 ATTCGTCGACAATACAAAGGGGCC
+656_1143 ATTGCCAGTGTGGGCTGGTTAGTA
+656_1143 ATTTCGCGACCATCCGCCACTTTG
+656_1143 CAAACTTTAGCACAGTGTGTGTCC
+1141_1564 ATAAACGGCCTTCGACATTGTGAC
+1141_1564 ATAAAGTCACCTGTGAATACGTTG
+1141_1564 ATAAATCGAAACCGTGCCCAACAA
+1892_2398 ATTTAGATATTTTCTTCTTTTTCT
+1892_2398 ATTTAGTTATCCGTCGGCGACGAA
+1892_2398 ATTTAGTTTGAATTGCCCTGCGTC
+

diff -r 85d870b8ae92 -r b202c97deabe test-data/output_file.pdf

Binary file test-data/output_file.pdf has changed

diff -r 85d870b8ae92 -r b202c97deabe test-data/output_file.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_file.tabular Mon Oct 08 05:53:50 2018 -0400

@@ -0,0 +1,41 @@
+Dataset: Test_data
+ AB BA
+max. family size: 85 332
+absolute frequency: 9 1
+relative frequency: 0.209 0.062
+
+total nr. of reads 1312
+
+
+Values from family size distribution
+ 87_636 656_1143 1141_1564 1892_2398
+FS=3 0 0 0 1
+FS=4 2 0 0 0
+FS=5 2 0 0 0
+FS=6 0 1 0 0
+FS=7 3 0 0 2
+FS=8 0 0 0 1
+FS=9 1 0 0 2
+FS=10 2 0 0 0
+FS=11 0 1 1 0
+FS=12 0 0 0 0
+FS=13 0 0 0 0
+FS=14 0 0 0 0
+FS=15 0 0 1 0
+FS=16 0 0 0 0
+FS=17 0 0 0 0
+FS=18 0 0 0 0
+FS=19 0 0 0 0
+FS=20 0 0 0 0
+FS>20 0 8 4 0
+sum 10 10 6 6
+
+
+In the plot, both family sizes of the ab and ba strands were used.
+Whereas the total numbers indicate only the single count of the tags per region.
+Region total nr. of tags per region
+87_636 5
+656_1143 5
+1141_1564 3
+1892_2398 3
+sum of tags 16