# HG changeset patch
# User mheinzl
# Date 1543225101 18000
# Node ID fca9b000b8d815b84af67907033f5eefad097838
# Parent  e80557c091e9beda80dd68be6ed3f47bf95b46b3
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd_beforevsafter commit 82b53a26581a2296ad272bbba1e80934864dfa58

diff -r e80557c091e9 -r fca9b000b8d8 fsd_beforevsafter.py
--- a/fsd_beforevsafter.py	Mon Nov 26 04:32:45 2018 -0500
+++ b/fsd_beforevsafter.py	Mon Nov 26 04:38:21 2018 -0500
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # Family size distribution of DCS from various steps of the Galaxy pipeline
 #
-# Author: Monika Heinzl, Johannes-Kepler University Linz (Austria)
+# Author: Monika Heinzl & Gundula Povysil, Johannes-Kepler University Linz (Austria)
 # Contact: monika.heinzl@edumail.at
 #
 # Takes a TXT file with tags of reads that were aligned to certain regions of the reference genome (optional),
@@ -9,15 +9,17 @@
 # a FASTA file with tags after trimming as input (optional).
 # The program produces a plot which shows the distribution of family sizes of the DCS from the input files and
 # a CSV file with the data of the plot.
-# USAGE: python FSD before vs after_no_refF1.3_FINAL.py --inputFile_SSCS filenameSSCS --inputName1 filenameSSCS --makeDCS filenameMakeDCS --afterTrimming filenameAfterTrimming -- alignedTags filenameTagsRefGenome
-# --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf
+# USAGE: python FSD before vs after_no_refF1.3_FINAL.py --inputFile_SSCS filenameSSCS --inputName1 filenameSSCS --makeDCS filenameMakeDCS --afterTrimming filenameAfterTrimming --bamFile DCSbamFile
+# --output_tabular outputfile_name_tabular --output_pdf outputfile_name_pdf
 
 import argparse
+import re
 import sys
 from collections import Counter
 
 import matplotlib.pyplot as plt
 import numpy
+import pysam
 from Bio import SeqIO
 from matplotlib.backends.backend_pdf import PdfPages
 
@@ -53,8 +55,8 @@
                         help='FASTA File with information about tag and family size in the header.')
     parser.add_argument('--afterTrimming', default=None,
                         help='FASTA File with information about tag and family size in the header.')
-    parser.add_argument('--alignedTags', default=None,
-                        help=' TXT file with tags aligned to the reference genome and family size.')
+    parser.add_argument('--bamFile',
+                        help='BAM file with aligned reads.')
     parser.add_argument('--output_pdf', default="data.pdf", type=str,
                         help='Name of the pdf and tabular file.')
     parser.add_argument('--output_tabular', default="data.tabular", type=str,
@@ -65,12 +67,12 @@
 def compare_read_families_read_loss(argv):
     parser = make_argparser()
     args = parser.parse_args(argv[1:])
-
+    # unpack the parsed command-line arguments into local variables
     SSCS_file = args.inputFile_SSCS
     SSCS_file_name = args.inputName1
     makeConsensus = args.makeDCS
     afterTrimming = args.afterTrimming
-    ref_genome = args.alignedTags
+    ref_genome = args.bamFile
     title_file = args.output_tabular
     title_file2 = args.output_pdf
     sep = "\t"
@@ -164,17 +166,25 @@
 
 # data of tags aligned to reference genome
         if ref_genome != str(None):
-            mut_array = readFileReferenceFree(ref_genome, " ")
+            pysam.index(ref_genome)
+            bam = pysam.AlignmentFile(ref_genome, "rb")
+            seq_mut = []
+            for read in bam.fetch():
+                if not read.is_unmapped:
+                    if re.search('_', read.query_name):
+                        tags = re.split('_', read.query_name)[0]
+                    else:
+                        tags = read.query_name
+                    seq_mut.append(tags)
+
             # use only unique tags that were alignment to the reference genome
-            seq_mut, seqMut_index = numpy.unique(numpy.array(mut_array[:, 1]), return_index=True)
-
-            # get family sizes
+            seq_mut = numpy.array(seq_mut)
+            seq_mut, seqMut_index = numpy.unique(seq_mut, return_index=True)
+            # get family sizes for each tag in the BAM file
             quant_ab = []
+            quant_ba = []
             for i in seq_mut:
                 quant_ab.append(seqDic_ab.get(i))
-
-            quant_ba = []
-            for i in seq_mut:
                 quant_ba.append(seqDic_ba.get(i))
 
             quant_ab_ref = numpy.array(quant_ab)
@@ -182,6 +192,7 @@
             quant_all_ref = numpy.concatenate((quant_ab_ref, quant_ba_ref))
             bigFamilies = numpy.where(quant_all_ref > 20)[0]  # group large family sizes
             quant_all_ref[bigFamilies] = 22
+
             list1.append(quant_all_ref)
             colors.append("#04cec7")
             labels.append("after alignment\nto reference")
@@ -198,22 +209,21 @@
         ticks1[len(ticks1) - 1] = ">20"
         plt.xticks(numpy.array(ticks), ticks1)
         if ref_genome != str(None):
-            count = numpy.array(
-                [v for k, v in sorted(Counter(quant_ab_ref).iteritems())])  # count all family sizes from all ab strands
+            count = numpy.array([v for k, v in sorted(Counter(quant_ab_ref).iteritems())])  # count all family sizes from all ab strands
 
-            legend = "max. family size =\nabsolute frequency=\nrelative frequency=\n\ntotal nr. of reads (before)"
-            plt.text(0.1, 0.105, legend, size=11, transform=plt.gcf().transFigure)
+            legend = "max. family size:\nabsolute frequency:\nrelative frequency:\n\ntotal nr. of reads:\n(before SSCS building)"
+            plt.text(0.1, 0.085, legend, size=11, transform=plt.gcf().transFigure)
 
             legend = "AB\n{}\n{}\n{:.5f}\n\n{:,}" \
                 .format(max(quant_ab_ref), count[len(count) - 1], float(count[len(count) - 1]) / sum(count),
                         sum(numpy.array(data_array[:, 0]).astype(int)))
-            plt.text(0.3, 0.105, legend, size=11, transform=plt.gcf().transFigure)
+            plt.text(0.35, 0.105, legend, size=11, transform=plt.gcf().transFigure)
 
             count2 = numpy.array(
                 [v for k, v in sorted(Counter(quant_ba_ref).iteritems())])  # count all family sizes from all ba strands
             legend = "BA\n{}\n{}\n{:.5f}" \
                 .format(max(quant_ba_ref), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2))
-            plt.text(0.4, 0.15, legend, size=11, transform=plt.gcf().transFigure)
+            plt.text(0.45, 0.1475, legend, size=11, transform=plt.gcf().transFigure)
 
         legend4 = "* In the plot, the family sizes of ab and ba strands and of both duplex tags were used.\nWhereas the total numbers indicate only the single count of the formed duplex tags."
         plt.text(0.1, 0.02, legend4, size=11, transform=plt.gcf().transFigure)
@@ -239,12 +249,12 @@
                 "relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep,
                                                                  float(count2[len(count2) - 1]) / sum(count2)))
 
-        output_file.write("\n\ntotal nr. of reads{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int))))
+        output_file.write("\ntotal nr. of reads before SSCS building{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int))))
         output_file.write("\n\nValues from family size distribution\n")
 
         if afterTrimming == str(None) and ref_genome == str(None):
             if afterTrimming == str(None):
-                output_file.write("{}before SSCS buidling{}after DCS building\n".format(sep, sep))
+                output_file.write("{}before SSCS building{}after DCS building\n".format(sep, sep))
             elif ref_genome == str(None):
                 output_file.write("{}before SSCS building{}atfer DCS building\n".format(sep, sep))
 
@@ -258,7 +268,7 @@
 
         elif afterTrimming == str(None) or ref_genome == str(None):
             if afterTrimming == str(None):
-                output_file.write("{}before SSCS buidling{}after DCS building{}after alignment to reference\n".format(sep, sep, sep))
+                output_file.write("{}before SSCS building{}after DCS building{}after alignment to reference\n".format(sep, sep, sep))
             elif ref_genome == str(None):
                 output_file.write("{}before SSCS building{}atfer DCS building{}after trimming\n".format(sep, sep, sep))
 
diff -r e80557c091e9 -r fca9b000b8d8 fsd_beforevsafter.xml
--- a/fsd_beforevsafter.xml	Mon Nov 26 04:32:45 2018 -0500
+++ b/fsd_beforevsafter.xml	Mon Nov 26 04:38:21 2018 -0500
@@ -1,9 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <tool id="fsd_beforevsafter" name="FSD Before/After" version="1.0.1">
     <description>: Family Size Distribution of duplex sequecning tags during DuNovo analysis</description>
-    <macros>
-        <import>fsd_reg_macros.xml</import>
-    </macros>
     <requirements>
 		<!-- galaxy version 16.04 -->
         <requirement type="package" version="2.7">python</requirement>
@@ -65,9 +62,24 @@
         
 The output is a PDF file with the plot and a tabular file with the data of the plot.
          
-@author@
+**About Author**
+        
+Author: Monika Heinzl
+    
+Department: Institute of Bioinformatics, Johannes Kepler University Linz, Austria
+    
+Contact: monika.heinzl@edumail.at
         
-]]> 
+        ]]> 
+
     </help>
-    <expand macro="@citation@" />
+    <citations>
+        <citation type="bibtex">
+            @misc{duplex,
+            author = {Heinzl, Monika},
+            year = {2018},
+            title = {Development of algorithms for the analysis of duplex sequencing data}
+         }
+        </citation>
+    </citations>
 </tool>