changeset 8:238a71241876 draft

planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd_beforevsafter commit dfaab79252a858e8df16bbea3607ebf1b6962e5a-dirty
author mheinzl
date Mon, 08 Oct 2018 05:55:14 -0400
parents c357ce2783a4
children e486f84adbec
files fsd_beforevsafter.py fsd_beforevsafter.xml test-data/Test_data.tabular test-data/Test_data_DCS.fasta test-data/Test_data_regions.txt test-data/Test_data_trimming.fasta test-data/output_file.pdf test-data/output_file.tabular
diffstat 8 files changed, 268 insertions(+), 123 deletions(-) [+]
line wrap: on
line diff
--- a/fsd_beforevsafter.py	Wed May 23 15:04:39 2018 -0400
+++ b/fsd_beforevsafter.py	Mon Oct 08 05:55:14 2018 -0400
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-
 # Family size distribution of DCS from various steps of the Galaxy pipeline
 #
 # Author: Monika Heinzl, Johannes-Kepler University Linz (Austria)
@@ -10,25 +9,27 @@
 # a FASTA file with tags after trimming as input (optional).
 # The program produces a plot which shows the distribution of family sizes of the DCS from the input files and
 # a CSV file with the data of the plot.
-
-# USAGE: python FSD before vs after_no_refF1.3_FINAL.py --inputFile_SSCS filenameSSCS --inputName1 filenameSSCS --makeDCS filenameMakeDCS --afterTrimming filenameAfterTrimming -- alignedTags filenameTagsRefGenome 
-# --sep "characterWhichSeparatesCSVFile" --output_csv outptufile_name_csv --output_pdf outptufile_name_pdf
-
+# USAGE: python fsd_beforevsafter.py --inputFile_SSCS filenameSSCS --inputName1 filenameSSCS --makeDCS filenameMakeDCS --afterTrimming filenameAfterTrimming --alignedTags filenameTagsRefGenome
+# --output_tabular outputfile_name_tabular --output_pdf outputfile_name_pdf
 
-import numpy
-import matplotlib.pyplot as plt
-from collections import Counter
-from Bio import SeqIO
 import argparse
 import sys
-import os
+from collections import Counter
+
+import matplotlib.pyplot as plt
+import numpy
+from Bio import SeqIO
 from matplotlib.backends.backend_pdf import PdfPages
 
+plt.switch_backend('agg')
+
+
 def readFileReferenceFree(file, delim):
     with open(file, 'r') as dest_f:
         data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter=delim, comments='#', dtype='string')
         return(data_array)
 
+
 def readFasta(file):
     tag_consensus = []
     fs_consensus = []
@@ -38,10 +39,11 @@
             line = record.description
             a, b = line.split(" ")
             fs1, fs2 = b.split("-")
-            fs_consensus.extend([fs1,fs2])
+            fs_consensus.extend([fs1, fs2])
     fs_consensus = numpy.array(fs_consensus).astype(int)
     return(tag_consensus, fs_consensus)
 
+
 def make_argparser():
     parser = argparse.ArgumentParser(description='Analysis of read loss in duplex sequencing data')
     parser.add_argument('--inputFile_SSCS',
@@ -49,18 +51,17 @@
     parser.add_argument('--inputName1')
     parser.add_argument('--makeDCS',
                         help='FASTA File with information about tag and family size in the header.')
-    parser.add_argument('--afterTrimming',default=None,
+    parser.add_argument('--afterTrimming', default=None,
                         help='FASTA File with information about tag and family size in the header.')
-    parser.add_argument('--alignedTags',default=None,
+    parser.add_argument('--alignedTags', default=None,
                         help=' TXT file with tags aligned to the reference genome and family size.')
     parser.add_argument('--output_pdf', default="data.pdf", type=str,
-                        help='Name of the pdf and csv file.')
-    parser.add_argument('--output_csv', default="data.csv", type=str,
-                        help='Name of the pdf and csv file.')
-    parser.add_argument('--sep', default=",",
-                        help='Separator in the csv file.')
+                        help='Name of the pdf and tabular file.')
+    parser.add_argument('--output_tabular', default="data.tabular", type=str,
+                        help='Name of the pdf and tabular file.')
     return parser
 
+
 def compare_read_families_read_loss(argv):
     parser = make_argparser()
     args = parser.parse_args(argv[1:])
@@ -70,16 +71,12 @@
     makeConsensus = args.makeDCS
     afterTrimming = args.afterTrimming
     ref_genome = args.alignedTags
-    title_file = args.output_csv
+    title_file = args.output_tabular
     title_file2 = args.output_pdf
-    sep = args.sep
-
-    if type(sep) is not str or len(sep)>1:
-        print("Error: --sep must be a single character.")
-        exit(4)
+    sep = "\t"
 
     with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf:
-        ### PLOT ###
+        # PLOT
         plt.rc('figure', figsize=(11.69, 8.27))  # A4 format
         plt.rcParams['axes.facecolor'] = "E0E0E0"  # grey background color
         plt.rcParams['xtick.labelsize'] = 14
@@ -92,7 +89,7 @@
         colors = []
         labels = []
 
-### data with tags of SSCS
+# data with tags of SSCS
         data_array = readFileReferenceFree(SSCS_file, "\t")
         seq = numpy.array(data_array[:, 1])
         tags = numpy.array(data_array[:, 2])
@@ -107,7 +104,7 @@
         seqDic_ab = dict(zip(all_ab, quant_ab_sscs))
         seqDic_ba = dict(zip(all_ba, quant_ba_sscs))
 
-        ### get tags of the SSCS which form a DCS
+        # get tags of the SSCS which form a DCS
         # group large family sizes
         bigFamilies = numpy.where(quant > 20)[0]
         quant[bigFamilies] = 22
@@ -139,9 +136,9 @@
         plt.text(0.55, 0.14, legend1, size=11, transform=plt.gcf().transFigure)
         plt.text(0.88, 0.14, legend2, size=11, transform=plt.gcf().transFigure)
 
-## data make DCS
+# data make DCS
         tag_consensus, fs_consensus = readFasta(makeConsensus)
-        ### group large family sizes in the plot of fasta files
+        # group large family sizes in the plot of fasta files
         bigFamilies = numpy.where(fs_consensus > 20)[0]
         fs_consensus[bigFamilies] = 22
         list1.append(fs_consensus)
@@ -152,7 +149,7 @@
         plt.text(0.55, 0.11, legend3, size=11, transform=plt.gcf().transFigure)
         plt.text(0.88, 0.11, legend4, size=11, transform=plt.gcf().transFigure)
 
-### data after trimming
+# data after trimming
         if afterTrimming != str(None):
             tag_trimming, fs_trimming = readFasta(afterTrimming)
             bigFamilies = numpy.where(fs_trimming > 20)[0]
@@ -165,10 +162,10 @@
             plt.text(0.55, 0.09, legend5, size=11, transform=plt.gcf().transFigure)
             plt.text(0.88, 0.09, legend6, size=11, transform=plt.gcf().transFigure)
 
-### data of tags aligned to reference genome
+# data of tags aligned to reference genome
         if ref_genome != str(None):
             mut_array = readFileReferenceFree(ref_genome, " ")
-            ### use only unique tags that were alignment to the reference genome
+            # use only unique tags that were aligned to the reference genome
             seq_mut, seqMut_index = numpy.unique(numpy.array(mut_array[:, 1]), return_index=True)
 
             # get family sizes
@@ -194,7 +191,6 @@
             plt.text(0.55, 0.07, legend7, size=11, transform=plt.gcf().transFigure)
             plt.text(0.88, 0.07, legend8, size=11, transform=plt.gcf().transFigure)
 
-
         counts = plt.hist(list1, bins=range(-1, maximumX + 1), stacked=False, label=labels, color=colors,
                           align="left", alpha=1, edgecolor="black", linewidth=1)
         ticks = numpy.arange(0, maximumX, 1)
@@ -248,60 +244,41 @@
 
         if afterTrimming == str(None) and ref_genome == str(None):
             if afterTrimming == str(None):
-                output_file.write(
-                "{}before SSCS buidling{}after DCS building\n".format(sep, sep))
+                output_file.write("{}before SSCS buidling{}after DCS building\n".format(sep, sep))
             elif ref_genome == str(None):
-                output_file.write(
-                    "{}before SSCS building{}atfer DCS building\n".format(sep, sep))
+                output_file.write("{}before SSCS building{}atfer DCS building\n".format(sep, sep))
 
-            for fs, sscs, dcs in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])],
-                                                      counts[0][1][2:len(counts[0][1])]):
+            for fs, sscs, dcs in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], counts[0][1][2:len(counts[0][1])]):
                 if fs == 21:
                     fs = ">20"
                 else:
                     fs = "={}".format(fs)
-                output_file.write(
-                    "FS{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs)))
-            output_file.write(
-                "sum{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1]))))
+                output_file.write("FS{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs)))
+            output_file.write("sum{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1]))))
 
         elif afterTrimming == str(None) or ref_genome == str(None):
             if afterTrimming == str(None):
-                output_file.write(
-                "{}before SSCS buidling{}after DCS building{}after alignment to reference\n".format(sep, sep, sep))
+                output_file.write("{}before SSCS buidling{}after DCS building{}after alignment to reference\n".format(sep, sep, sep))
             elif ref_genome == str(None):
-                output_file.write(
-                    "{}before SSCS building{}atfer DCS building{}after trimming\n".format(sep, sep, sep))
+                output_file.write("{}before SSCS building{}atfer DCS building{}after trimming\n".format(sep, sep, sep))
 
-            for fs, sscs, dcs, reference in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])],
-                                                      counts[0][1][2:len(counts[0][1])],counts[0][2][2:len(counts[0][2])]):
+            for fs, sscs, dcs, reference in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], counts[0][1][2:len(counts[0][1])], counts[0][2][2:len(counts[0][2])]):
                 if fs == 21:
                     fs = ">20"
                 else:
                     fs = "={}".format(fs)
-                output_file.write(
-                    "FS{}{}{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs), sep, int(reference)))
-            output_file.write(
-                "sum{}{}{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])), sep, int(sum(counts[0][2]))))
+                output_file.write("FS{}{}{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs), sep, int(reference)))
+            output_file.write("sum{}{}{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])), sep, int(sum(counts[0][2]))))
 
         else:
-            output_file.write(
-                "{}before SSCS building{}after DCS building{}after trimming{}after alignment to reference\n".format(
-                    sep, sep, sep, sep))
-            for fs, sscs, dcs, trim, reference in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])],
-                                                      counts[0][1][2:len(counts[0][1])],
-                                                      counts[0][2][2:len(counts[0][2])],
-                                                      counts[0][3][2:len(counts[0][3])]):
+            output_file.write("{}before SSCS building{}after DCS building{}after trimming{}after alignment to reference\n".format(sep, sep, sep, sep))
+            for fs, sscs, dcs, trim, reference in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], counts[0][1][2:len(counts[0][1])], counts[0][2][2:len(counts[0][2])], counts[0][3][2:len(counts[0][3])]):
                 if fs == 21:
                     fs = ">20"
                 else:
                     fs = "={}".format(fs)
-                output_file.write(
-                    "FS{}{}{}{}{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs), sep, int(trim), sep,
-                                                    int(reference)))
-            output_file.write(
-                "sum{}{}{}{}{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])), sep,
-                                               int(sum(counts[0][2])), sep, int(sum(counts[0][3]))))
+                output_file.write("FS{}{}{}{}{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs), sep, int(trim), sep, int(reference)))
+            output_file.write("sum{}{}{}{}{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])), sep, int(sum(counts[0][2])), sep, int(sum(counts[0][3]))))
 
         output_file.write("\n\nIn the plot, the family sizes of ab and ba strands and of both duplex tags were used.\nWhereas the total numbers indicate only the single count of the formed duplex tags.\n")
         output_file.write("total nr. of tags (unique, FS>=1){}{}\n".format(sep, len(seq_unique_FS)))
@@ -316,5 +293,6 @@
 
         print("Files successfully created!")
 
+
 if __name__ == '__main__':
-  sys.exit(compare_read_families_read_loss(sys.argv))
+    sys.exit(compare_read_families_read_loss(sys.argv))
--- a/fsd_beforevsafter.xml	Wed May 23 15:04:39 2018 -0400
+++ b/fsd_beforevsafter.xml	Mon Oct 08 05:55:14 2018 -0400
@@ -1,90 +1,78 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<tool id="fsd_beforevsafter" name="Duplex Sequencing Analysis: fsd_beforevsafter" version="0.0.9">
+<tool id="fsd_beforevsafter" name="FSD Before/After" version="1.0.0">
+    <description>: Family Size Distribution of duplex sequencing tags during DuNovo analysis</description>
     <requirements>
 		<!-- galaxy version 16.04 -->
         <requirement type="package" version="2.7">python</requirement>
         <requirement type="package" version="1.4">matplotlib</requirement>
         <requirement type="package" version="1.71">biopython</requirement>
-        
     </requirements>
-    <description>Family size distribution (FSD) of tags from various steps of the Du Novo pipeline</description>
     <command>
-        python2 $__tool_directory__/fsd_beforevsafter.py --inputFile_SSCS "$file1" --inputName1 "$file1.name" --makeDCS "$makeDCS" --afterTrimming "$afterTrimming" --alignedTags "$alignedTags" --sep $separator --output_pdf $output_pdf --output_csv $output_csv 
+        python2 '$__tool_directory__/fsd_beforevsafter.py' --inputFile_SSCS '$file1' --inputName1 '$file1.name' --makeDCS '$makeDCS' --afterTrimming '$afterTrimming' --alignedTags '$alignedTags' --output_pdf $output_pdf --output_tabular $output_tabular
     </command>
     <inputs>
         <param name="file1" type="data" format="tabular" label="Dataset 1: input tags of whole dataset" optional="false" help="Input in tabular format with the family size, tags and the direction of the strand ('ab' or 'ba') for each family."/>
         <param name="makeDCS" type="data" format="fasta" label="Dataset 2: tags after making DCSs" help="Input in fasta format with the tags of the reads, which were aligned to DCSs, and their family sizes of both strands (reverse and forward) in the header, as well as the read itself in the next line."/>
         <param name="afterTrimming" type="data" format="fasta" optional="true" label="Dataset 3: tags after trimming" help="Input in fasta format with the tags of the reads, which were not filtered out after trimming, and their family sizes of both strands (reverse and forward) in the header, as well as the read itself in the next following line."/>
-        <param name="alignedTags" type="data" format="txt" optional="true" label="Dataset 4: input tags aligned to the reference genome" help="Input in txt format with the regions and the tags, which were aligned to the reference genome."/>
-        <param name="separator" type="text" label="Separator of the CSV file." help="can be a single character" value=","/>
+        <param name="alignedTags" type="data" format="txt" optional="true" label="Dataset 4: input tags aligned to the reference genome" help="Input in txt format with the regions of the reference genome and the tags, which were aligned to the reference genome."/>
     </inputs>
     <outputs>
         <data name="output_pdf" format="pdf" />
-        <data name="output_csv" format="csv"/>
+        <data name="output_tabular" format="tabular"/>
     </outputs>
-    <help> <![CDATA[
+    <tests>
+        <test>
+            <param name="file1" value="Test_data.tabular"/>
+            <param name="makeDCS" value="Test_data_DCS.fasta"/>
+            <param name="afterTrimming" value="Test_data_trimming.fasta"/>
+            <param name="alignedTags" value="Test_data_regions.txt"/>
+            <output name="output_pdf" file="output_file.pdf" lines_diff="183"/>
+            <output name="output_tabular" file="output_file.tabular"/>
+        </test>
+    </tests>
+    <help><![CDATA[
 
 **What it does**
         
-    This tool will create a distribution of family sizes of various datasets obtained from different steps of the Du Novo pipeline. 
+This tool will create a distribution of family sizes of various datasets obtained from different steps of the Du Novo pipeline. 
+
+-----
                
 **Input**
         
-    **Dataset 1:** This tools expects a tabular file with the tags of all families, their sizes and information about forward (ab) and reverse (ba) strands. 
-    
-    +-----+----------------------------+----+
-    | 1   | AAAAAAAAAAAATGTTGGAATCTT   | ba |
-    +-----+----------------------------+----+
-    | 10  | AAAAAAAAAAAGGCGGTCCACCCC   | ab |
-    +-----+----------------------------+----+
-    | 28  | AAAAAAAAAAATGGTATGGACCGA   | ab |
-    +-----+----------------------------+----+
-    
-    
-    
-    **Dataset 2:** And a fasta file with all tags and their family sizes of both strands (forward and reverse) in the header and the read itself in the next line is required. This input file can be obtained by Du Novo: make consensus reads. 
-    
-    
-    **Dataset 3 (optional):** In addition, the fasta file with all tags, which were not filtered out after trimming, can be given.
-    For both input files, only one file from both tools are necessary (these tools give for both forward and reverse strands an output file), since both files have the same tags and family sizes, but different reads, which are not required in this tool.
+**Dataset 1:** This tool expects a tabular file with the tags of all families, their sizes and information about forward (ab) and reverse (ba) strands::
+
+      1  AAAAAAAAAAAATGTTGGAATCTT ba
+     10  AAAAAAAAAAAGGCGGTCCACCCC ab
+     28  AAAAAAAAAAATGGTATGGACCGA ab
+          
+**Dataset 2:** A fasta file with all tags and their family sizes of both strands (forward and reverse) in the header and the read itself in the next line is required. This input file can be obtained by the tool "Du Novo: make consensus reads". 
     
-    +-------------------------------------------+
-    | >AAAAAAAAATAGATCATAGACTCT 7-10            |
-    |                                           |
-    | CTAGACTCACTGGCGTTACTGACTGCGAGACCCTCCACGTG |
-    +-------------------------------------------+
-    | >AAAAAAAAGGCAGAAGATATACGC 11-3            |
-    |                                           |
-    | CNCNGGCCCCCCGCTCCGTGCACAGACGNNGCNACTGACAA |
-    +-------------------------------------------+
-    
-    
-    
+**Dataset 3 (optional):** In addition, the fasta file with all tags, which were not filtered out after trimming, can be given. This file can be obtained by the tool "Sequence Content Trimmer".
+For both input files, only one file from each tool is necessary (these tools produce one output file each for the forward and reverse strands), since both files have the same tags and family sizes, but different reads, which are not required in this tool::
+ 
+  >AAAAAAAAATAGATCATAGACTCT 7-10
+  CTAGACTCACTGGCGTTACTGACTGCGAGACCCTCCACGTG
+  >AAAAAAAAGGCAGAAGATATACGC 11-3
+  CNCNGGCCCCCCGCTCCGTGCACAGACGNNGCNACTGACAA
 	
-    **Dataset 4 (optional):** Finally, a TXT file with the regions and all tags that were aligned to the reference genome can be given as input. This file can obtained from "Duplex Sequencing Analysis: range2tag"
+**Dataset 4 (optional):** Finally, a TXT file with the regions and all tags that were aligned to the reference genome can be given as input. This file can be obtained by the tool "Duplex Sequencing Analysis: range2tag"::
 
-    +-----------+------------------------------+
-    | 87_636    | AAATCAAAGTATGAATGAAGTTGCCT   |
-    +-----------+------------------------------+
-    | 87_636    | AAATTCATAGCATTAATTTCAACGGG   |
-    +-----------+------------------------------+
-    | 656_1143  | GGGGCAGCCATATTGGCAATTATCAT   |
-    +-----------+------------------------------+
-   
+ 87_636   AAATCAAAGTATGAATGAAGTTGCCT
+ 87_636   AAATTCATAGCATTAATTTCAACGGG
+ 656_1143 GGGGCAGCCATATTGGCAATTATCAT
+
 **Output**
         
-    The output is a PDF file with the plot and a CSV with the data of the plot.
-        
-        
+The output is a PDF file with the plot and a tabular file with the data of the plot.
+         
 **About Author**
         
-    Author: Monika Heinzl
-    
-    Department: Institute of Bioinformatics, Johannes Kepler University Linz, Austria
-    
-    Contact: monika.heinzl@edumail.at
+Author: Monika Heinzl
+Department: Institute of Bioinformatics, Johannes Kepler University Linz, Austria
+Contact: monika.heinzl@edumail.at
         
-        ]]> 
+]]> 
 
     </help>
     <citations>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Test_data.tabular	Mon Oct 08 05:55:14 2018 -0400
@@ -0,0 +1,32 @@
+10	AAAAAACATCCCAATAAGAAATCA	ab
+9	AAAAAACATCCCAATAAGAAATCA	ba
+4	AAAAAAGTCCTTCGACTCAAGCGG	ab
+5	AAAAAAGTCCTTCGACTCAAGCGG	ba
+5	AAAAAATAGTTAAGCCGACACACT	ab
+7	AAAAAATAGTTAAGCCGACACACT	ba
+7	AAAAAATGTGCCGAACCTTGGCGA	ab
+10	AAAAAATGTGCCGAACCTTGGCGA	ba
+7	AAAAACAACATAGCTTGAAAATTT	ab
+4	AAAAACAACATAGCTTGAAAATTT	ba
+81	ATTCGGATAATTCGACGCAACATT	ab
+11	ATTCGGATAATTCGACGCAACATT	ba
+41	ATTCGTCGACAATACAAAGGGGCC	ab
+226	ATTCGTCGACAATACAAAGGGGCC	ba
+6	ATTGCCAGTGTGGGCTGGTTAGTA	ab
+41	ATTGCCAGTGTGGGCTGGTTAGTA	ba
+50	ATTTCGCGACCATCCGCCACTTTG	ab
+332	ATTTCGCGACCATCCGCCACTTTG	ba
+64	CAAACTTTAGCACAGTGTGTGTCC	ab
+57	CAAACTTTAGCACAGTGTGTGTCC	ba
+85	ATAAACGGCCTTCGACATTGTGAC	ab
+15	ATAAACGGCCTTCGACATTGTGAC	ba
+11	ATAAAGTCACCTGTGAATACGTTG	ab
+35	ATAAAGTCACCTGTGAATACGTTG	ba
+83	ATAAATCGAAACCGTGCCCAACAA	ab
+63	ATAAATCGAAACCGTGCCCAACAA	ba
+9	ATTTAGATATTTTCTTCTTTTTCT	ab
+7	ATTTAGATATTTTCTTCTTTTTCT	ba
+7	ATTTAGTTATCCGTCGGCGACGAA	ab
+3	ATTTAGTTATCCGTCGGCGACGAA	ba
+8	ATTTAGTTTGAATTGCCCTGCGTC	ab
+9	ATTTAGTTTGAATTGCCCTGCGTC	ba
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Test_data_DCS.fasta	Mon Oct 08 05:55:14 2018 -0400
@@ -0,0 +1,50 @@
+>AAAAAAGGACCCTACCACCAACGT 8-5
+CTAGGGTACTTTGGGGCACGAAACATTCTAAAAATCTTCATTCAATGCTGGTGGAAGTCAGAACGCCCCCCCTTCTGGCCCAGCACTGACCCCCGGCTGTACCTCCACGCCCTGTCGCCCACGCGGCGCCAACCTGCCCCTGCTGACCCAAGCAGGTGTCCCTGGNGTCCAACGCGTCCATGAGCTNCNACNCNCCACTGGTGCGCNNCGCNNGNCTNNNNNCAGNNNANNNCCNCANNNNNNCCNNNNNCNNNNNNNNNNNNNNNCCNNNNNNNNNNNNNN
+>AAAAAAGGATTCCAAATCTCTGGA 3-7
+TACTCCATGCCCCGGGCCACCTGGTAGGCACAGGACACCAGGTCCTTGAAGGTGAGCTGCTCCTCGGGCGGCTTGCAGGTGTCGAAGGAGTAGTCCAGGCCCGGGGGCCGCCGCGCCCGCAGAAACTCCCGCAGGTTACCCTTGGCCGCGTACTCCACCAGCACGTACAGGGNCCCTGGGGACACGGGCTCCTCAGACGGGNTGCCAGGCNCNGGAGGNCCGCNCAGCCGGNNNCCACCGCNNSNNNCCNNNNCCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNTNN
+>AAAAAAGGCCAGTTTAAAAAAACT 37-3
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTNGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCNNNTGGGCCTGGCANCCCNNCNGAGGAGCCNGNNNCCNCAGGTCCCCTGTACNNNCTNGTG
+>AAAAAAGGCCCAGATCTCTTAAAA 194-31
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCTCCTGGGCCTGGCAGCCCGTCTGAGGAGCCCGTGTCCCCAGGGCCCCTGTACGTGCTGGTGGAGTACGCGGCCAAGGGTAACCTGCGGGAGTTTCAGTCATTTTAAG
+>AAAAAAGGGGCCTCATGCGTCAGT 4-3
+CTAGGCTCTACATGGTGAGCAGAGACGAGGAGAGGGGAGCCCGCCTGGCTGCAGAGAGGGCTCACACAGCCCAGGACCAGCGTGGGCCGAGGTGGGGCTCCAGGAGGCCTGGCGGGCAGGCAGCTCAGAACCTGGTATCTACTTTCTGTTACCTGTCGCTTGAGCNGGAAGNGGGAGANCTTGTGCACGGTGGNNGANCCNAGGCCTTNCTTGGGGGGNNTGCGNNNNNNNNNNNNNNNNNCNNNNNNNNNNNNNNGGNNNANNNGNNNNNNNNNNNNNNNNT
+>AAAAAAGTCCTTCGACTCAAGCGG 4-5
+GATCCTGCCGTGTGGACTCTGTGCGGTGCCCGCAGGGCGGTGCTGGCGCTCGCCTATCGCTCTGCTCTCTCTTTGTAGACGGCGGGCGCTAACACCACCGACAAGGAGCTAGAGGTTCTCTCCTTGCACAACGTCACCTTTGAGGACGCCGGGGAGTACACCTGCCTGGCGGGCAATTCTATTGGGTTTTCTCATCACTCTGCGTGGCTGGTGGTNCTGCCAGGTACCGGCNTCTNCTGCTGCTGNNGNNCNNCNNTNNCNNNNNNNNNNTNNCNNNNNNNNNNNN
+>AAAAAAGTGGGATCGGGTTGCAGC 11-6
+CTGGGGTCCTGGCTCTGCCCAGTTCCCGCCTCCACCCCTGAAGCCTGAGCTCTGCAGGACACGTACACGTCACTCTGGTGAGTGTAGACTCNGTCAAACAAGGCCTCAGGCGCCATCCACTTCACGGGCAGCCGGCCCTGGGAGGGTGTGGGAAGGCGGTGTTGGCGCCAGGCGTCCTACTGGCATGACCCCCACCCCCGCNCCCCAGGGCCGGGCNCACGTTGGTTGTCTTCTTGNANTAGTCNNNNTNGTGCNCGTNNNNNNNCNNNNNNNNNNNNNCNNNN
+>AAAAAATAATTTCGCCCTCGAGTA 16-4
+CTAGGGTACTTTGGGGCACGAAACATTCTAAAAATCTTCATTCAATGCTGGTGGAAGTCAGAACGCCCCCCCTTCTGGCCCAGCACTGACCCCCGGCTGTACCTCCACGCCCTGTCGCCCACGCGGCGCCAACCTGCCCCTGCTGACCCAAGCAGGTGTCCCTGGAGTCCAACGCGTCCATGAGCTCCANCNCACCACTGGNNNGNNTCNCANNGCTNTCNNCAGNNNNNGGCCNNNNNCTGNNCNANNTCNCC
+>AAAAAATAGTTAAGCCGACACACT 5-7
+GATCCTGCCGTGTGGACTCTGTGCGGTGCCCGCAGGGCGGTGCTGGCGCTCGCCTATCGCTCTGCTCTCTCTTTGTAGACGGCGGGCGCTAACACCACCGACAAGGAGCTAGAGGTTCTCTCCTTGCACAACGTCACCTTTGAGGACGCCGGGGNGTACACCTGCCTGGCGGGCAATTCTATNGGGTTTTCTCATCACTCTGCGTGGNNGGTGGTGCTGCCAGGTACCGGCNTCTGCTGCTNNNGCTGNNCCGCNNNNNNNNNNNNNNNNNGNNNNNNNNNNCN
+>AAAAAATCAGAATAGAGCGCTTTC 4-3
+GATCCTGCCGTGTGGACTCTGTGCGGTGCCCGCAGGGCGGTGCTGGCGCTCGCCTATCGCTCTGCTCTCTCTTTGTAGACGGCGGGCGCTAACACCACCGACAAGGAGCTAGAGGTTCTCTCCTTGCACAACGTCACCTTTGNGGACGCCGGGGAGTANACCTGCCTGGCGGGCAATTCTATTGGGTTTTCTCATCACTCTGCGTGNCTGGTGGNGCTGCCNGNNNCCNNNNNCTNNNNNNNNNGCTNNNNNNNNNTNNNNNNNNNNNNNNNNNCTNNCNNNNNNNN
+>AAAAAATCATAAACATTTTAACAA 65-21
+CTAGGGTACTTTGGGGCACGAAACATTCTAAAAATCTTCATTCAATGCTGGTGGAAGTCAGAACGCCCCCCCTTCTGGCCCAGCACTGACCCCCGGCTGTACCTCCACGCCCTGTCGCCCACGTGGCGCCAACCTGCCCCTGCTGACCCAAGCAGGTGTCCCTGGAGTCCAACGCGTCCATGAGCTCCAACACACCACTGGTGCGCATCGCAAGGCTGTCCTCAGGGGAGGGCCCCACGCTGGCCAATGTCTCCNNNCNNGNGCNGCCTGCCNNNNNCAAANGG
+>AAAAAATCTTGACTCGGTACACAA 9-3
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGMCGGCTGGGCGGCCCTCCTGGGCCTGGCAGCCCNNCNGAGGAGCCNNNNTCNCCAGGNCCNNTGNANNNNCNGGNGGAGTANNNGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+>AAAAAATGACGAACGATTCGTCAT 3-6
+CTAGAGGGCCAGACCCTGGAGAGAAGGAGCCCAGCAGAGCCAGCCAGTCCCACACCGCCACCAGGCGCCCGGGAGACACCAGAGCCACAGGAGAGGCCTTTGGGGACCCAGATGGGAAGTGGGCTCGAGGGGGCTGAGGGGGCCCCTCTGGGACCAGGACCGGGCCAGGCCAACTTTGTCCCCACNNTGGGCACAGGGNCAGGAGNNNNNGCNCAAGNANNNNNNNNNNNNNNNNNNTCNNNNNNNNNNNCNNNNNNNNNNKNNNNNNNNNNNNNNNNNNNCNNNNNNNN
+>AAAAAATGTGCCGAACCTTGGCGA 7-10
+CTAGAGGGCCAGACCCTGGAGAGAAGGAGCCCAGCAGAGCCAGCCAGTCCCACACCGCCACCAGGCGCCCGGGAGACACCAGAGCCACAGGAGAGGCCTTTGGGGACCCAGATGGGAAGTGGGCTCGAGGGGGCTGAGGGGGCCCCTCTGGGACCAGGACCGGGCCAGGCCAACTTTGTCCCCACACTGGGCACAGGGCCAGGAGTGAGGGCTCAAGAAGCGGGNNGNNNGNAAGTCNCAGGATTNNNNNCNNNNNTNNNANNTTTGGCNNNNNNNNNNNNANN
+>AAAAAATTGAATCCGTGGATATAG 3-8
+CTAGACTCACTGGCGTTACTGACTGCGAGACCCTCCAGACAAGGCGCGTGCTGAGGTTCTGAGCCCCCTTCCGCTCCCAGTGGTGCCTGCGGCTCTGGGCCAGGGGCATCCATGGGAGCCCCGTGGGGGGGGGGNNCCNNNCCNGGCCNNAACGCCCATNTCTTTNCNGCNNNGNNNGANCNGGTGGAGGNNNGACNAGGCGGGNNGTGNNTNNNNMNGNCNNNNNNNNNNNNGNNNNGNGNNNNNNNNNTGTNNNTCMNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+>AAAAAATTGCATGCATCGTCCCTG 6-9
+CTAGGGTACTTTGGGGCACGAAACATTCTAAAAATCTTCATTCAATGCTGGTGGAAGTCAGAACGCCCCCCCTTCTGGCCCAGCACTGACCCCCGGCTGTACCTCCACGCCCTGTCGCCCACGCGGCGCCAACCTGCCCCTGCTGACCCAAGCAGGTGTCCCTGGAGTCCAACGCGTCCATGAGCTCCAACACACCACTGGTGCGNATCGCAAGGCTGTCCTCAGGGGNGGNCCNCNNNNTGNCNNATNNCNCCGNGCNNNNGCNNCCTGCNNNNNNCNNNN
+>AAAAAATTGGCATTGTGTATGCAT 18-5
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCTCCTGGGCCTGGCNGNNCNNNNGNNNANNCCGTGNCCCCAGGGNCNNTGNNNNNNNNGGTNNNNNACGNNNNCNANNNTANCCNNNNNNNGT
+>AAAAAATTTTCCCACCAAAATTTC 18-3
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTNNNGGNTNGGCGGCCCTCCTGGGCCTGGCNGCNCGTCTGAGGNNCCNNNGTCNNCAGGNNNNCNNTNNNNNNNGGNGNNNNANNNNGNNNNGNNNNNNNNNNNNNNNNNNCNGNNNNNNNNNNNN
+>AAAAAATTTTTCTTTACCACCTGT 4-4
+CTAGACTCACTGGCGTTACTGACTGCGAGACCCTCCAGACAAGGCGCGTGCTGAGGTTCTGAGCCCCCTTCCGCTCCCAGTGGTGCCTGCGGCTCTGGGCCAGGGGCATCCATGGGAGCCCCGTGGGGGGGGGGGCCNGGCNNNNCNNNAACGCCCANNNTNTTTNNANCNNNNGNGNNNNNGNNNNNNNSNNNCNNNNNNNNNNGNNNNNNNNNNNNNCNNNNNNNNCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+>AAAAACAACATAGCTTGAAAATTT 7-4
+CTAGAGGGCCAGACCCTGGAGAGAAGGAGCCCAGCAGAGCCAGCCAGTCCCACACCGCCACCAGGCGCCCGGGAGACACCAGAGCCACAGGAGAGGCCTTTGGGGACCCAGATGGGAAGTGGGCTCGAGGGGGCTGAGGGGGCCCCTCTGGGACCAGGACCGGGCCAGGCCAACTTTGTCCCCACACTGGGCNCAGGGCCAGGAGTGAGGGCNCNAGNAGCNGGACGGNNGTAAGTNNNNNNANTNNNNNNNNTNNNNGCNNNNNTNNNNNNNCNNNNNNNAGN
+>AAAAACAACCAACGTTCTATCTCT 18-4
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCTCCTGGGCCTGGCAGCCCNNNNGNGGNNNNNGNGTCCCCAGGGCCCCTGNNNNNNNNGGNNGNNNNNNNGGNCNNGNNNNNCNTGNNNNANNNNNNNNNNNNNNNNNNN
+>AAAAACAAGATAATTGGCGCCCGT 5-22
+CTGCCATACACCCGTCCCAGGAGCATGTCCACAGAACCCCAGCCACACCCAACATCCGCCACATCCCTGACGGCCCCTAAACCCAGCCGGGCCTCTGACTGGTGGCTGTTTCACCCCCACCACCAAGCCCCCTACAGCCAACGCTGGCCCTCAGCACCACTGACCGGGCCCGAGACAGCTCCCATTTGGGGTCGGCAGGCAGCTCGAGCTCGGANACATTGGCCAGCGTGGNGNNNNNCCCNNNGNNCNNCCNTNNNNNNYNNNNNANNNNNNNNNNNNNNNMNNN
+>AAAAACAAGCATCTGTCGACACTA 69-60
+TGCCGCCTGCGCAGCCCCCCCAAGAAAGGCCTGGGCTCCCCCACCGTGCACAAGATCTCCCGCTTCCCGCTCAAGCGACAGGTAACAGAAAGTAGATACCAGGTTCTGAGCTGCCTGCCCGCCAGGCCTCCTGGAGCCCCACCTCGGCCCACGCTGGTCCTGGGCTGTGTGAGCCCTCTCTGCAGCCAGGCGGGCTCCCCTCTCCTCGTCTCTGCTCACCATGTAGAGCCTAGGGAGTCATAGTGTCGACAG
+>AAAAACAATCTTAACCGCGATCTA 10-4
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCTCCTGGGCCTGGCAGCCCGTCTGAGGNGCCNGNNTCCCCNGGTCCCCTGTNNGTGCTGGNGGANTACNNGGNCNAGGGTNNCCNGCNNNNGNTNNTGNNNGNNNNNNNN
+>AAAAACACGCGGACTTTCCGCATT 4-7
+ACTCCATGCCCCGGGCCACCTGGTAGGCACAGGACACCAGGTCCTTGAAGGTGAGCTGCTCCTCGGGCGGCTTGCAGGTGTCGAAGGAGTAGTCCAGGCCCGGGGGCCGCCGCGCCCGCAGAAACTCCCGCAGGTTACCCTTGGCCGCGTACTCCACCAGCACGTACAGGGGCCCTGGGGACACGGGCTCCTCAGACGGGCTGCCAGGCCCAGGNGGGCCGCCCAGCCGGCACCACCGCCGCTACCGCNNCTACCNCCNNNNCGTGCNNNNNNCNNNCAGNNNNNN
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Test_data_regions.txt	Mon Oct 08 05:55:14 2018 -0400
@@ -0,0 +1,17 @@
+87_636 AAAAAACATCCCAATAAGAAATCA
+87_636 AAAAAAGTCCTTCGACTCAAGCGG
+87_636 AAAAAATAGTTAAGCCGACACACT
+87_636 AAAAAATGTGCCGAACCTTGGCGA
+87_636 AAAAACAACATAGCTTGAAAATTT
+656_1143 ATTCGGATAATTCGACGCAACATT
+656_1143 ATTCGTCGACAATACAAAGGGGCC
+656_1143 ATTGCCAGTGTGGGCTGGTTAGTA
+656_1143 ATTTCGCGACCATCCGCCACTTTG
+656_1143 CAAACTTTAGCACAGTGTGTGTCC
+1141_1564 ATAAACGGCCTTCGACATTGTGAC
+1141_1564 ATAAAGTCACCTGTGAATACGTTG
+1141_1564 ATAAATCGAAACCGTGCCCAACAA
+1892_2398 ATTTAGATATTTTCTTCTTTTTCT
+1892_2398 ATTTAGTTATCCGTCGGCGACGAA
+1892_2398 ATTTAGTTTGAATTGCCCTGCGTC
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Test_data_trimming.fasta	Mon Oct 08 05:55:14 2018 -0400
@@ -0,0 +1,34 @@
+>AAAAAGAAAAACGATGCTTGACCA 4-6
+CTGGGGTCCTGGCTCTGCCCAGTTCCCGCCTCCACCCCTGAAGCCTGAGCTCTGCAGGACACGTACACGTCACTCTGGTGAGTGTAGACTCGGTCAAACAAGGCCTCAGGCGCCATCCACTTCACGGGCAGCCGGCCCTGGGAGGGTGTGGGAAGGCGGTGTTGGCGCCAGGCGTCCTACTGGCATGACCCCCACCCCCGC
+>AAAAAGAAAAGTTTGCTTTTTCTT 13-17
+CTCCATGCCCCGGGCCACCTGGTAGGCACAGGACACCAGGTCCTTGAAGGTGAGCTGCTCCTCGGGCGGCTTGCAGGTGTCGAAGGAGTAGTCCAGGCCCGGGGGCCGCCGCGCCCGCAGAAACTCCCGCAGGTTACCCTTGGCCGCGTACTCCACCAGCACGTACAGGGGACCTGGGGACACGGGCTCCTCAGACGGGCTGCCAGGCCCAGGAGGGCCGCCCAGCCGGCACCACCGCCGCT
+>AAAAAGAAATGAATTGGTCCTAGA 24-7
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCTCCTGGGCCTGGCAGCCCGTNTGAGGAGCC
+>AAAAAGACAGCCTGAATTCCTTGT 17-4
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGNGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCNCCTGGGCCTGGCAGCCCGTCNGAG
+>AAAAAGACGATTACACAATAACCT 16-7
+CTAGGGTACTTTGGGGCACGAAACATTCTAAAAATCTTCATTCAATGCTGGTGGAAGTCAGAACGCCCCCCCTTCTGGCCCAGCACTGACCCCCGGCTGTACCTCCACGCCCTGTCGCCCACGTGGCGCCAACCTGCCCCTGCTGACCCAAGCAGGTGTCCCTGGAGTCCAACGCGTCCATGAGCTCCAACACACCACTGGT
+>AAAAAGATACGGGAGGTGAATTGT 75-6
+CTCTGCGTGGCTGGTGGTGCTGCCAGGTACCGGCTTCTGCTGCTGCTGCTGCTCCGCACTGTCTGGGGGACGCTGGCTCGGGACACGCCAAAGCTGCCAGGACGGACGGGAATCCTGTGACTTACGGCCGTCCCGCTTCTTGAGCCCTCACTCCTGGCCCTGTGCCCAGTGTGGGGACAAAGTTGGCCTGGCCCGGTCCTGGTCCCAGAGGGGC
+>AAAAAGATATTTTAATCGGCCCGA 7-6
+GATCCTGCCGTGTGGACTCTGTGCGGTGCCCGCAGGGCGGTGCTGGCGCTCGCCTATCGCTCTGCTCTCTCTTTGTAGACGGCGGGCGCTAACACCACCGACAAGGAGCTAGAGGTTCTMTCCTTGCACAACGTCACCTTTGAGGACGCCGGGGAGTACACCTGCCTGGCGGGCAATTCTATTGGGTTTTCTCATCACTCTGCGTGGCTGGTGGTGCTGC
+>AAAAAGATTACACTGAAATCTTTT 25-5
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCTCCTGGGCCTGGCAGCCCNTCTGAGGAGCCCNTGTCCCCAGGGCC
+>AAAAAGCCATATGGTCGAAGAGAT 13-10
+ACTCCATGCCCCGGGCCACCTGGTAGGCACAGGACACCAGGTCCTTGAAGGTGAGCTGCTCCTCGGGCGGCTTGCAGGTGTCGAAGGAGTAGTCCAGGCCCGGGGGCCGCCGCGCCCGCAGAAACTCCCGCAGGTTACCCTTGGCCGCGTACTCCACCAGCACGTACAGGGGACCTGGGGACACGGGCTCCTCAGACGGGCTGCCAGGCCCAGGAGGGCCGCCCAGCCGGCACCACCGCC
+>AAAAAGCGAAAGTGCCCCATATTT 13-16
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCTCCTGGGCCTGGCAGCCCNTCTGAGGAGCCCGTGTCCCCAGGGCCCCTGTACGTNCTGGTGG
+>AAAAAGCGATTTAACTGAAATTAT 5-4
+CTAGAGGGCCAGACCCTGGAGAGAAGGAGCCCAGCAGAGCCAGCCAGTCCCACACCGCCACCAGGCGCCCGGGAGACACCAGAGCCACAGGAGAGGCCTTTGGGGACCCAGATGGGAAGTGGGCTCGAGGGGGCTGAGGGGGCCCCTCTGGGACCAGGACCGGGCCAGGCCAACTTTGTCCCCACACTGGGCACAGGGCCAGGAGTGAGGGC
+>AAAAAGCGGGGTGGCCTTACGCCC 17-10
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCTCCTGGGCCTGGCAGCCCGTCTGAGGAGCCCGNGTCCCCAGGGCCNCTGTACGT
+>AAAAAGCTCTACCCCCACGAAGCG 5-10
+GATCCTGCCGTGTGGACTCTGTGCGGTGCCCGCAGGGCGGTGCTGGCGCTCGCCTATCGCTCTGCTCTCTCTTTGTAGACGGCGGGCGCTAACACCACCGACAAGGAGCTAGAGGTTCTCTCCTTGCACAACGTCACCTTTGAGGACGCCGGGGAGTACACCTGCCTGGCGGGCAATTCTATTGGGTTTTCTCATCACTCTGCGTGGCTGGTGGTGCTGCCAGGTACCGNCTTCTGCTGCTGCTGC
+>AAAAAGGATATGTCTAACATCCCT 15-16
+CTAGGCTCTACATGGTGAGCAGAGACGAGGAGAGGGGAGCCCGCCTGGCTGCAGAGAGGGCTCACACAGCCCAGGACCAGCGTGGGCCGAGGTGGGGCTCCAGGAGGCCTGGCGGGCAGGCAGCTCAGAACCTGGTATCTACTTTCTGTTACCTGTCGCTTGAGCGGGAAGCGGGAGATCTTGTGCACGGTGGGGGAGCCCAGGCCTTTCTTGGGGGGGCTGCGCAGGCGGCAGAGCGTCACAGCCGCCACCACCAGGATGAACAGGAAGAAGCCCACCCCGT
+>AAAAAGGTACACCCGAGATGAACT 13-9
+CACAGGCCCCCCGCTCCGTGCACAGACGATGCCACTGACAAGGACCTGTCGGACCTGGTGTCTGAGATGGAGATGATGAAGATGATCGGGAAACACAAAAACATCATCAACCTGCTGGGCGCCTGCACGCAGGGCGGTAGGTGCGGTAGCGGCGGTGGTGCCGGCTGGGCGGCCCTCCTGGGCCTGGCAGCCCGTCTGAGGNGCCCGTGTCCCCAGGTCCCCTGTACGTGCTGGTGGAGTAC
+>AAAAAGTAGCTTCGGTTCGGGTCT 12-4
+GATCCTGCCGTGTGGACTCTGTGCGGTGCCCGCAGGGCGGTGCTGGCGCTCGCCTATCGCTCTGCTCTCTCTTTGTAGACGGCGGGCGCTAACACCACCGACAAGGAGCTAGAGGTTCTCTCCTTGCACAACGTCACCTTTGAGGACGCCGGGGANTACACCTGCCTGGCGGGCANTTCTATTGGGTTTTCTCATCACTCTGCGTG
+>AAAAAGTAGGGACATAATTGACTT 4-4
+CTGGGGTCCTGGCTCTGCCCAGTTCCCGCCTCCACCCCTGAAGCCTGAGCTCTGCAGGACACGTACACGTCACTCTGGTGAGTGTAGACTCGGTCAAACAAGGCCTCAGGCGCCATCCACTTCACGGGCAGCCGGCCCTGGGAGGGTGTGGGAAGGCGRTGTTGGCGCCAGGCGTCCTACTGGCATGACCCCCACCCCCGCACCCCA
\ No newline at end of file
Binary file test-data/output_file.pdf has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_file.tabular	Mon Oct 08 05:55:14 2018 -0400
@@ -0,0 +1,46 @@
+Dataset:	Test_data.tabular
+	AB	BA
+max. family size:	85	332
+absolute frequency:	1	1
+relative frequency:	0.062	0.062
+
+
+
+total nr. of reads	1312
+
+
+Values from family size distribution
+	before SSCS building	after DCS building	after trimming	after alignment to reference
+FS=1	0	0	0	0
+FS=2	0	0	0	0
+FS=3	1	8	0	1
+FS=4	2	10	6	2
+FS=5	2	5	3	2
+FS=6	1	3	3	1
+FS=7	5	5	3	5
+FS=8	1	2	0	1
+FS=9	3	2	1	3
+FS=10	2	2	3	2
+FS=11	2	1	0	2
+FS=12	0	0	1	0
+FS=13	0	0	4	0
+FS=14	0	0	0	0
+FS=15	1	0	1	1
+FS=16	0	1	3	0
+FS=17	0	0	3	0
+FS=18	0	3	0	0
+FS=19	0	0	0	0
+FS=20	0	0	0	0
+FS>20	12	8	3	12
+sum	32	50	34	32
+
+
+In the plot, the family sizes of ab and ba strands and of both duplex tags were used.
+Whereas the total numbers indicate only the single count of the formed duplex tags.
+total nr. of tags (unique, FS>=1)	16
+DCS (before SSCS building, FS>=1)	16
+total nr. of tags (unique, FS>=3)	16
+DCS (before SSCS building, FS>=3)	16
+after DCS building	25
+after trimming	17
+after alignment to reference	16