changeset 0:cb08a27e5fc2 draft

Uploaded
author davidvanzessen
date Mon, 29 Aug 2016 05:44:57 -0400
parents
children cbce7f35f8b0
files demultiplex.xml fastqc_v0.11.2.zip fastx_barcode_splitter.pl sff2fastq trim.py wrapper.sh
diffstat 6 files changed, 947 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/demultiplex.xml	Mon Aug 29 05:44:57 2016 -0400
@@ -0,0 +1,349 @@
+<tool id="demultiplex" name="Demultiplex" version="1.0">
+	<description> </description>
+	<requirements>
+		<requirement type="package" version="0.0.13">fastx_toolkit</requirement>
+	</requirements>
+	<command interpreter="bash">
+		wrapper.sh "$input" "$out_file" "$out_file.files_path" "$where" "$mismatches" "$partial" "$input.name"
+		#for $i, $b in enumerate($barcodes)
+            "$b.id"
+            "$b.mid"
+            "$b.trim_start"
+            "$b.trim_end"
+    #end for
+	</command>
+	<inputs>
+		<param name="input" type="data" label="File to split" />
+		<repeat name="barcodes" title="Barcodes" min="1" default="1">
+      <param name="id" type="text" size="50" label="ID" />
+			<param name="mid" type="select" label="Mid">
+				<option value="ACGAGTGCGT">MID-1</option>
+				<option value="ACGCACTCGT">MID-1 reverse complement</option>
+				<option value="ACGCTCGACA">MID-2</option>
+				<option value="TGTCGAGCGT">MID-2 reverse complement</option>
+				<option value="AGACGCACTC">MID-3</option>
+				<option value="GAGTGCGTCT">MID-3 reverse complement</option>
+				<option value="AGCACTGTAG">MID-4</option>
+				<option value="CTACAGTGCT">MID-4 reverse complement</option>
+				<option value="ATCAGACACG">MID-5</option>
+				<option value="CGTGTCTGAT">MID-5 reverse complement</option>
+				<option value="ATATCGCGAG">MID-6</option>
+				<option value="CTCGCGATAT">MID-6 reverse complement</option>
+				<option value="CGTGTCTCTA">MID-7</option>
+				<option value="TAGAGACACG">MID-7 reverse complement</option>
+				<option value="CTCGCGTGTC">MID-8</option>
+				<option value="GACACGCGAG">MID-8 reverse complement</option>
+				<option value="TCTCTATGCG">MID-10</option>
+				<option value="CGCATAGAGA">MID-10 reverse complement</option>
+				<option value="TGATACGTCT">MID-11</option>
+				<option value="AGACGTATCA">MID-11 reverse complement</option>
+				<option value="CATAGTAGTG">MID-13</option>
+				<option value="CACTACTATG">MID-13 reverse complement</option>
+				<option value="CGAGAGATAC">MID-14</option>
+				<option value="GTATCTCTCG">MID-14 reverse complement</option>
+				<option value="ATACGACGTA">MID-15</option>
+				<option value="TACGTCGTAT">MID-15 reverse complement</option>
+				<option value="TCACGTACTA">MID-16</option>
+				<option value="TAGTACGTGA">MID-16 reverse complement</option>
+				<option value="CGTCTAGTAC">MID-17</option>
+				<option value="GTACTAGACG">MID-17 reverse complement</option>
+				<option value="TCTACGTAGC">MID-18</option>
+				<option value="GCTACGTAGA">MID-18 reverse complement</option>
+				<option value="TGTACTACTC">MID-19</option>
+				<option value="GAGTAGTACA">MID-19 reverse complement</option>
+				<option value="ACGACTACAG">MID-20</option>
+				<option value="CTGTAGTCGT">MID-20 reverse complement</option>
+				<option value="CGTAGACTAG">MID-21</option>
+				<option value="CTAGTCTACG">MID-21 reverse complement</option>
+				<option value="TACGAGTATG">MID-22</option>
+				<option value="CATACTCGTA">MID-22 reverse complement</option>
+				<option value="TACTCTCGTG">MID-23</option>
+				<option value="CACGAGAGTA">MID-23 reverse complement</option>
+				<option value="TAGAGACGAG">MID-24</option>
+				<option value="CTCGTCTCTA">MID-24 reverse complement</option>
+				<option value="TCGTCGCTCG">MID-25</option>
+				<option value="CGAGCGACGA">MID-25 reverse complement</option>
+				<option value="ACATACGCGT">MID-26</option>
+				<option value="ACGCGTATGT">MID-26 reverse complement</option>
+				<option value="ACGCGAGTAT">MID-27</option>
+				<option value="ATACTCGCGT">MID-27 reverse complement</option>
+				<option value="ACTACTATGT">MID-28</option>
+				<option value="ACATAGTAGT">MID-28 reverse complement</option>
+				<option value="ACTGTACAGT">MID-29</option>
+				<option value="ACTGTACAGT">MID-29 reverse complement</option>
+				<option value="AGACTATACT">MID-30</option>
+				<option value="AGTATAGTCT">MID-30 reverse complement</option>
+				<option value="AGCGTCGTCT">MID-31</option>
+				<option value="AGACGACGCT">MID-31 reverse complement</option>
+				<option value="AGTACGCTAT">MID-32</option>
+				<option value="ATAGCGTACT">MID-32 reverse complement</option>
+				<option value="ATAGAGTACT">MID-33</option>
+				<option value="AGTACTCTAT">MID-33 reverse complement</option>
+				<option value="CACGCTACGT">MID-34</option>
+				<option value="ACGTAGCGTG">MID-34 reverse complement</option>
+				<option value="CAGTAGACGT">MID-35</option>
+				<option value="ACGTCTACTG">MID-35 reverse complement</option>
+				<option value="CGACGTGACT">MID-36</option>
+				<option value="AGTCACGTCG">MID-36 reverse complement</option>
+				<option value="TACACACACT">MID-37</option>
+				<option value="AGTGTGTGTA">MID-37 reverse complement</option>
+				<option value="TACACGTGAT">MID-38</option>
+				<option value="ATCACGTGTA">MID-38 reverse complement</option>
+				<option value="TACAGATCGT">MID-39</option>
+				<option value="ACGATCTGTA">MID-39 reverse complement</option>
+				<option value="TACGCTGTCT">MID-40</option>
+				<option value="AGACAGCGTA">MID-40 reverse complement</option>
+				<option value="TAGTGTAGAT">MID-41</option>
+				<option value="ATCTACACTA">MID-41 reverse complement</option>
+				<option value="TCGATCACGT">MID-42</option>
+				<option value="ACGTGATCGA">MID-42 reverse complement</option>
+				<option value="TCGCACTAGT">MID-43</option>
+				<option value="ACTAGTGCGA">MID-43 reverse complement</option>
+				<option value="TCTAGCGACT">MID-44</option>
+				<option value="AGTCGCTAGA">MID-44 reverse complement</option>
+				<option value="TCTATACTAT">MID-45</option>
+				<option value="ATAGTATAGA">MID-45 reverse complement</option>
+				<option value="TGACGTATGT">MID-46</option>
+				<option value="ACATACGTCA">MID-46 reverse complement</option>
+				<option value="TGTGAGTAGT">MID-47</option>
+				<option value="ACTACTCACA">MID-47 reverse complement</option>
+				<option value="ACAGTATATA">MID-48</option>
+				<option value="TATATACTGT">MID-48 reverse complement</option>
+				<option value="ACGCGATCGA">MID-49</option>
+				<option value="TCGATCGCGT">MID-49 reverse complement</option>
+				<option value="ACTAGCAGTA">MID-50</option>
+				<option value="TACTGCTAGT">MID-50 reverse complement</option>
+				<option value="AGCTCACGTA">MID-51</option>
+				<option value="TACGTGAGCT">MID-51 reverse complement</option>
+				<option value="AGTATACATA">MID-52</option>
+				<option value="TATGTATACT">MID-52 reverse complement</option>
+				<option value="AGTCGAGAGA">MID-53</option>
+				<option value="TCTCTCGACT">MID-53 reverse complement</option>
+				<option value="AGTGCTACGA">MID-54</option>
+				<option value="TCGTAGCACT">MID-54 reverse complement</option>
+				<option value="CGATCGTATA">MID-55</option>
+				<option value="TATACGATCG">MID-55 reverse complement</option>
+				<option value="CGCAGTACGA">MID-56</option>
+				<option value="TCGTACTGCG">MID-56 reverse complement</option>
+				<option value="CGCGTATACA">MID-57</option>
+				<option value="TGTATACGCG">MID-57 reverse complement</option>
+				<option value="CGTACAGTCA">MID-58</option>
+				<option value="TGACTGTACG">MID-58 reverse complement</option>
+				<option value="CGTACTCAGA">MID-59</option>
+				<option value="TCTGAGTACG">MID-59 reverse complement</option>
+				<option value="CTACGCTCTA">MID-60</option>
+				<option value="TAGAGCGTAG">MID-60 reverse complement</option>
+				<option value="CTATAGCGTA">MID-61</option>
+				<option value="TACGCTATAG">MID-61 reverse complement</option>
+				<option value="TACGTCATCA">MID-62</option>
+				<option value="TGATGACGTA">MID-62 reverse complement</option>
+				<option value="TAGTCGCATA">MID-63</option>
+				<option value="TATGCGACTA">MID-63 reverse complement</option>
+				<option value="TATATATACA">MID-64</option>
+				<option value="TGTATATATA">MID-64 reverse complement</option>
+				<option value="TATGCTAGTA">MID-65</option>
+				<option value="TACTAGCATA">MID-65 reverse complement</option>
+				<option value="TCACGCGAGA">MID-66</option>
+				<option value="TCTCGCGTGA">MID-66 reverse complement</option>
+				<option value="TCGATAGTGA">MID-67</option>
+				<option value="TCACTATCGA">MID-67 reverse complement</option>
+				<option value="TCGCTGCGTA">MID-68</option>
+				<option value="TACGCAGCGA">MID-68 reverse complement</option>
+				<option value="TCTGACGTCA">MID-69</option>
+				<option value="TGACGTCAGA">MID-69 reverse complement</option>
+				<option value="TGAGTCAGTA">MID-70</option>
+				<option value="TACTGACTCA">MID-70 reverse complement</option>
+				<option value="TGTAGTGTGA">MID-71</option>
+				<option value="TCACACTACA">MID-71 reverse complement</option>
+				<option value="TGTCACACGA">MID-72</option>
+				<option value="TCGTGTGACA">MID-72 reverse complement</option>
+				<option value="TGTCGTCGCA">MID-73</option>
+				<option value="TGCGACGACA">MID-73 reverse complement</option>
+				<option value="ACACATACGC">MID-74</option>
+				<option value="GCGTATGTGT">MID-74 reverse complement</option>
+				<option value="ACAGTCGTGC">MID-75</option>
+				<option value="GCACGACTGT">MID-75 reverse complement</option>
+				<option value="ACATGACGAC">MID-76</option>
+				<option value="GTCGTCATGT">MID-76 reverse complement</option>
+				<option value="ACGACAGCTC">MID-77</option>
+				<option value="GAGCTGTCGT">MID-77 reverse complement</option>
+				<option value="ACGTCTCATC">MID-78</option>
+				<option value="GATGAGACGT">MID-78 reverse complement</option>
+				<option value="ACTCATCTAC">MID-79</option>
+				<option value="GTAGATGAGT">MID-79 reverse complement</option>
+				<option value="ACTCGCGCAC">MID-80</option>
+				<option value="GTGCGCGAGT">MID-80 reverse complement</option>
+				<option value="AGAGCGTCAC">MID-81</option>
+				<option value="GTGACGCTCT">MID-81 reverse complement</option>
+				<option value="AGCGACTAGC">MID-82</option>
+				<option value="GCTAGTCGCT">MID-82 reverse complement</option>
+				<option value="AGTAGTGATC">MID-83</option>
+				<option value="GATCACTACT">MID-83 reverse complement</option>
+				<option value="AGTGACACAC">MID-84</option>
+				<option value="GTGTGTCACT">MID-84 reverse complement</option>
+				<option value="AGTGTATGTC">MID-85</option>
+				<option value="GACATACACT">MID-85 reverse complement</option>
+				<option value="ATAGATAGAC">MID-86</option>
+				<option value="GTCTATCTAT">MID-86 reverse complement</option>
+				<option value="ATATAGTCGC">MID-87</option>
+				<option value="GCGACTATAT">MID-87 reverse complement</option>
+				<option value="ATCTACTGAC">MID-88</option>
+				<option value="GTCAGTAGAT">MID-88 reverse complement</option>
+				<option value="CACGTAGATC">MID-89</option>
+				<option value="GATCTACGTG">MID-89 reverse complement</option>
+				<option value="CACGTGTCGC">MID-90</option>
+				<option value="GCGACACGTG">MID-90 reverse complement</option>
+				<option value="CATACTCTAC">MID-91</option>
+				<option value="GTAGAGTATG">MID-91 reverse complement</option>
+				<option value="CGACACTATC">MID-92</option>
+				<option value="GATAGTGTCG">MID-92 reverse complement</option>
+				<option value="CGAGACGCGC">MID-93</option>
+				<option value="GCGCGTCTCG">MID-93 reverse complement</option>
+				<option value="CGTATGCGAC">MID-94</option>
+				<option value="GTCGCATACG">MID-94 reverse complement</option>
+				<option value="CGTCGATCTC">MID-95</option>
+				<option value="GAGATCGACG">MID-95 reverse complement</option>
+				<option value="CTACGACTGC">MID-96</option>
+				<option value="GCAGTCGTAG">MID-96 reverse complement</option>
+				<option value="CTAGTCACTC">MID-97</option>
+				<option value="GAGTGACTAG">MID-97 reverse complement</option>
+				<option value="CTCTACGCTC">MID-98</option>
+				<option value="GAGCGTAGAG">MID-98 reverse complement</option>
+				<option value="CTGTACATAC">MID-99</option>
+				<option value="GTATGTACAG">MID-99 reverse complement</option>
+				<option value="TAGACTGCAC">MID-100</option>
+				<option value="GTGCAGTCTA">MID-100 reverse complement</option>
+				<option value="TAGCGCGCGC">MID-101</option>
+				<option value="GCGCGCGCTA">MID-101 reverse complement</option>
+				<option value="TAGCTCTATC">MID-102</option>
+				<option value="GATAGAGCTA">MID-102 reverse complement</option>
+				<option value="TATAGACATC">MID-103</option>
+				<option value="GATGTCTATA">MID-103 reverse complement</option>
+				<option value="TATGATACGC">MID-104</option>
+				<option value="GCGTATCATA">MID-104 reverse complement</option>
+				<option value="TCACTCATAC">MID-105</option>
+				<option value="GTATGAGTGA">MID-105 reverse complement</option>
+				<option value="TCATCGAGTC">MID-106</option>
+				<option value="GACTCGATGA">MID-106 reverse complement</option>
+				<option value="TCGAGCTCTC">MID-107</option>
+				<option value="GAGAGCTCGA">MID-107 reverse complement</option>
+				<option value="TCGCAGACAC">MID-108</option>
+				<option value="GTGTCTGCGA">MID-108 reverse complement</option>
+				<option value="TCTGTCTCGC">MID-109</option>
+				<option value="GCGAGACAGA">MID-109 reverse complement</option>
+				<option value="TGAGTGACGC">MID-110</option>
+				<option value="GCGTCACTCA">MID-110 reverse complement</option>
+				<option value="TGATGTGTAC">MID-111</option>
+				<option value="GTACACATCA">MID-111 reverse complement</option>
+				<option value="TGCTATAGAC">MID-112</option>
+				<option value="GTCTATAGCA">MID-112 reverse complement</option>
+				<option value="TGCTCGCTAC">MID-113</option>
+				<option value="GTAGCGAGCA">MID-113 reverse complement</option>
+				<option value="ACGTGCAGCG">MID-114</option>
+				<option value="CGCTGCACGT">MID-114 reverse complement</option>
+				<option value="ACTCACAGAG">MID-115</option>
+				<option value="CTCTGTGAGT">MID-115 reverse complement</option>
+				<option value="AGACTCAGCG">MID-116</option>
+				<option value="CGCTGAGTCT">MID-116 reverse complement</option>
+				<option value="AGAGAGTGTG">MID-117</option>
+				<option value="CACACTCTCT">MID-117 reverse complement</option>
+				<option value="AGCTATCGCG">MID-118</option>
+				<option value="CGCGATAGCT">MID-118 reverse complement</option>
+				<option value="AGTCTGACTG">MID-119</option>
+				<option value="CAGTCAGACT">MID-119 reverse complement</option>
+				<option value="AGTGAGCTCG">MID-120</option>
+				<option value="CGAGCTCACT">MID-120 reverse complement</option>
+				<option value="ATAGCTCTCG">MID-121</option>
+				<option value="CGAGAGCTAT">MID-121 reverse complement</option>
+				<option value="ATCACGTGCG">MID-122</option>
+				<option value="CGCACGTGAT">MID-122 reverse complement</option>
+				<option value="ATCGTAGCAG">MID-123</option>
+				<option value="CTGCTACGAT">MID-123 reverse complement</option>
+				<option value="ATCGTCTGTG">MID-124</option>
+				<option value="CACAGACGAT">MID-124 reverse complement</option>
+				<option value="ATGTACGATG">MID-125</option>
+				<option value="CATCGTACAT">MID-125 reverse complement</option>
+				<option value="ATGTGTCTAG">MID-126</option>
+				<option value="CTAGACACAT">MID-126 reverse complement</option>
+				<option value="CACACGATAG">MID-127</option>
+				<option value="CTATCGTGTG">MID-127 reverse complement</option>
+				<option value="CACTCGCACG">MID-128</option>
+				<option value="CGTGCGAGTG">MID-128 reverse complement</option>
+				<option value="CAGACGTCTG">MID-129</option>
+				<option value="CAGACGTCTG">MID-129 reverse complement</option>
+				<option value="CAGTACTGCG">MID-130</option>
+				<option value="CGCAGTACTG">MID-130 reverse complement</option>
+				<option value="CGACAGCGAG">MID-131</option>
+				<option value="CTCGCTGTCG">MID-131 reverse complement</option>
+				<option value="CGATCTGTCG">MID-132</option>
+				<option value="CGACAGATCG">MID-132 reverse complement</option>
+				<option value="CGCGTGCTAG">MID-133</option>
+				<option value="CTAGCACGCG">MID-133 reverse complement</option>
+				<option value="CGCTCGAGTG">MID-134</option>
+				<option value="CACTCGAGCG">MID-134 reverse complement</option>
+				<option value="CGTGATGACG">MID-135</option>
+				<option value="CGTCATCACG">MID-135 reverse complement</option>
+				<option value="CTATGTACAG">MID-136</option>
+				<option value="CTGTACATAG">MID-136 reverse complement</option>
+				<option value="CTCGATATAG">MID-137</option>
+				<option value="CTATATCGAG">MID-137 reverse complement</option>
+				<option value="CTCGCACGCG">MID-138</option>
+				<option value="CGCGTGCGAG">MID-138 reverse complement</option>
+				<option value="CTGCGTCACG">MID-139</option>
+				<option value="CGTGACGCAG">MID-139 reverse complement</option>
+				<option value="CTGTGCGTCG">MID-140</option>
+				<option value="CGACGCACAG">MID-140 reverse complement</option>
+				<option value="TAGCATACTG">MID-141</option>
+				<option value="CAGTATGCTA">MID-141 reverse complement</option>
+				<option value="TATACATGTG">MID-142</option>
+				<option value="CACATGTATA">MID-142 reverse complement</option>
+				<option value="TATCACTCAG">MID-143</option>
+				<option value="CTGAGTGATA">MID-143 reverse complement</option>
+				<option value="TATCTGATAG">MID-144</option>
+				<option value="CTATCAGATA">MID-144 reverse complement</option>
+				<option value="TCGTGACATG">MID-145</option>
+				<option value="CATGTCACGA">MID-145 reverse complement</option>
+				<option value="TCTGATCGAG">MID-146</option>
+				<option value="CTCGATCAGA">MID-146 reverse complement</option>
+				<option value="TGACATCTCG">MID-147</option>
+				<option value="CGAGATGTCA">MID-147 reverse complement</option>
+				<option value="TGAGCTAGAG">MID-148</option>
+				<option value="CTCTAGCTCA">MID-148 reverse complement</option>
+				<option value="TGATAGAGCG">MID-149</option>
+				<option value="CGCTCTATCA">MID-149 reverse complement</option>
+				<option value="TGCGTGTGCG">MID-150</option>
+				<option value="CGCACACGCA">MID-150 reverse complement</option>
+				<option value="TGCTAGTCAG">MID-151</option>
+				<option value="CTGACTAGCA">MID-151 reverse complement</option>
+				<option value="TGTATCACAG">MID-152</option>
+				<option value="CTGTGATACA">MID-152 reverse complement</option>
+				<option value="TGTGCGCGTG">MID-153</option>
+				<option value="CACGCGCACA">MID-153 reverse complement</option>
+			</param>
+			
+			<param name="trim_start" type="integer" size="3" value="0" label="How many nucleotides to trim from the start" />
+		
+			<param name="trim_end" type="integer" size="3" value="0" label="How many nucleotides to trim from the end" />
+		</repeat>
+		
+		<param name="where" type="select" label="Barcodes found at">
+			<option value="bol">Start: 5' end</option>
+			<option value="eol">End: 3' end</option>
+		</param>
+		
+		<param name="mismatches" type="integer" size="3" value="2" label="Max. number of mismatches allowed." />
+		
+		<param name="partial" type="integer" size="3" value="0" label="Allow partial overlap of barcodes." />
+		
+	</inputs>
+	<outputs>
+		<data format="html" name="out_file" />
+	</outputs>
+	<help>
+- Splitting sff or fastq files into FASTQ, FASTA and (optional) trimmed FASTA files with a FASTQC report on the FASTQ file, this tool uses:
+- sff2fastq (https://github.com/indraniel/sff2fastq) to extract a fastq file.
+- fastx_barcode_splitter.pl (http://hannonlab.cshl.edu/fastx_toolkit/commandline.html) to demultiplex.
+- fastqc (http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) to provide analysis of the fastq files.
+		
+	</help>
+</tool>
Binary file fastqc_v0.11.2.zip has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastx_barcode_splitter.pl	Mon Aug 29 05:44:57 2016 -0400
@@ -0,0 +1,472 @@
+#!/usr/bin/perl
+
+#    FASTX-toolkit - FASTA/FASTQ preprocessing tools.
+#    Copyright (C) 2009-2013  A. Gordon (assafgordon@gmail.com)
+#
+#   This program is free software: you can redistribute it and/or modify
+#   it under the terms of the GNU Affero General Public License as
+#   published by the Free Software Foundation, either version 3 of the
+#   License, or (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU Affero General Public License for more details.
+#
+#    You should have received a copy of the GNU Affero General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+use strict;
+use warnings;
+use IO::Handle;
+use Data::Dumper;
+use Getopt::Long;
+use Carp;
+
+##
+## This program splits a FASTQ/FASTA file into several smaller files,
+## Based on barcode matching.
+##
+## run with "--help" for usage information
+##
+## Assaf Gordon <assafgordon@gmail.com> , 11sep2008
+
+# Forward declarations
+sub load_barcode_file ($);
+sub parse_command_line ;
+sub match_sequences ;
+sub mismatch_count($$) ;
+sub print_results;
+sub open_and_detect_input_format;
+sub read_record;
+sub write_record($);
+sub usage();
+
+# Global flags and arguments, 
+# Set by command line argumens
+my $barcode_file ;
+my $barcodes_at_eol = 0 ;
+my $barcodes_at_bol = 0 ;
+my $exact_match = 0 ;
+my $allow_partial_overlap = 0;
+my $allowed_mismatches = 1;
+my $newfile_suffix = '';
+my $newfile_prefix  ;
+my $quiet = 0 ;
+my $debug = 0 ;
+my $fastq_format = 1;
+
+# Global variables 
+# Populated by 'create_output_files'
+my %filenames;
+my %files;
+my %counts = ( 'unmatched' => 0 );
+my $barcodes_length;
+my @barcodes;
+my $input_file_io;
+
+
+# The Four lines per record in FASTQ format.
+# (when using FASTA format, only the first two are used)
+my $seq_name;
+my $seq_bases;
+my $seq_name2;
+my $seq_qualities;
+
+
+#
+# Start of Program
+#
+parse_command_line ;
+
+load_barcode_file ( $barcode_file ) ;
+
+open_and_detect_input_format;
+
+match_sequences ;
+
+print_results unless $quiet;
+
+#
+# End of program
+#
+
+
+
+
+
+
+
+
+sub parse_command_line {
+	my $help;
+
+	usage() if (scalar @ARGV==0);
+
+	my $result = GetOptions ( "bcfile=s" => \$barcode_file,
+				  "eol"  => \$barcodes_at_eol,
+				  "bol"  => \$barcodes_at_bol,
+				  "exact" => \$exact_match,
+				  "prefix=s" => \$newfile_prefix,
+				  "suffix=s" => \$newfile_suffix,
+				  "quiet" => \$quiet, 
+				  "partial=i" => \$allow_partial_overlap,
+				  "debug" => \$debug,
+				  "mismatches=i" => \$allowed_mismatches,
+				  "help" => \$help
+				  ) ;
+	
+	usage() if ($help);
+
+	die "Error: barcode file not specified (use '--bcfile [FILENAME]')\n" unless defined $barcode_file;
+	die "Error: prefix path/filename not specified (use '--prefix [PATH]')\n" unless defined $newfile_prefix;
+
+	if ($barcodes_at_bol == $barcodes_at_eol) {
+		die "Error: can't specify both --eol & --bol\n" if $barcodes_at_eol;
+		die "Error: must specify either --eol or --bol\n" ;
+	}
+
+	die "Error: invalid for value partial matches (valid values are 0 or greater)\n" if $allow_partial_overlap<0;
+
+	$allowed_mismatches = 0 if $exact_match;
+
+	die "Error: invalid value for mismatches (valid values are 0 or more)\n" if ($allowed_mismatches<0);
+
+	die "Error: partial overlap value ($allow_partial_overlap) bigger than " . 
+		"max. allowed mismatches ($allowed_mismatches)\n" if ($allow_partial_overlap > $allowed_mismatches);
+
+
+	exit unless $result;
+}
+
+
+
+#
+# Read the barcode file
+#
+sub load_barcode_file ($) {
+	my $filename = shift or croak "Missing barcode file name";
+
+	open BCFILE,"<$filename" or die "Error: failed to open barcode file ($filename)\n";
+	while (<BCFILE>) {
+		next if m/^#/;
+		chomp;
+		my ($ident, $barcode) = split ;
+
+		$barcode = uc($barcode);
+
+		# Sanity checks on the barcodes
+		die "Error: bad data at barcode file ($filename) line $.\n" unless defined $barcode;
+		die "Error: bad barcode value ($barcode) at barcode file ($filename) line $.\n"
+			unless $barcode =~ m/^[AGCT]+$/;
+
+		die "Error: bad identifier value ($ident) at barcode file ($filename) line $. (must be alphanumeric)\n" 
+			unless $ident =~ m/^\w+$/;
+
+		die "Error: badcode($ident, $barcode) is shorter or equal to maximum number of " .
+		    "mismatches ($allowed_mismatches). This makes no sense. Specify fewer  mismatches.\n" 
+		    	if length($barcode)<=$allowed_mismatches;
+
+		$barcodes_length = length($barcode) unless defined $barcodes_length;
+		die "Error: found barcodes in different lengths. this feature is not supported yet.\n" 
+			unless $barcodes_length == length($barcode);
+
+	 	push @barcodes, [$ident, $barcode];
+
+		if ($allow_partial_overlap>0) {
+			foreach my $i (1 .. $allow_partial_overlap) {
+				substr $barcode, ($barcodes_at_bol)?0:-1, 1, '';
+	 			push @barcodes, [$ident, $barcode];
+			}
+		}
+	}
+	close BCFILE;
+
+	if ($debug) {
+		print STDERR "barcode\tsequence\n";
+		foreach my $barcoderef (@barcodes) {
+			my ($ident, $seq) = @{$barcoderef};
+			print STDERR $ident,"\t", $seq ,"\n";
+		}
+	}
+}
+
+# Create one output file for each barcode.
+# (Also create a file for the dummy 'unmatched' barcode)
+sub create_output_files {
+	my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers;
+	$barcodes{'unmatched'} = 1 ;
+
+	foreach my $ident (keys %barcodes) {
+		my $new_filename = $newfile_prefix . $ident . $newfile_suffix; 
+		$filenames{$ident} = $new_filename;
+		open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n"; 
+		$files{$ident} = $file ;
+	}
+}
+
+sub match_sequences {
+
+	my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers;
+	$barcodes{'unmatched'} = 1 ;
+
+	#reset counters
+	foreach my $ident ( keys %barcodes ) {
+		$counts{$ident} = 0;
+	}
+
+	create_output_files;
+
+	# Read file FASTQ file
+	# split accotding to barcodes
+	while ( read_record ) {
+		chomp $seq_bases;
+
+		print STDERR "sequence $seq_bases: \n" if $debug;
+
+		my $best_barcode_mismatches_count = $barcodes_length;
+		my $best_barcode_ident = undef;
+
+		#Try all barcodes, find the one with the lowest mismatch count
+		foreach my $barcoderef (@barcodes) {
+			my ($ident, $barcode) = @{$barcoderef};
+
+			# Get DNA fragment (in the length of the barcodes)
+			# The barcode will be tested only against this fragment
+			# (no point in testing the barcode against the whole sequence)
+			my $sequence_fragment;
+			if ($barcodes_at_bol) {
+				$sequence_fragment = substr $seq_bases, 0, $barcodes_length;
+			} else {
+				$sequence_fragment = substr $seq_bases, - $barcodes_length;
+			}
+
+			my $mm = mismatch_count($sequence_fragment, $barcode) ; 
+
+			# if this is a partial match, add the non-overlap as a mismatch
+			# (partial barcodes are shorter than the length of the original barcodes)
+			$mm += ($barcodes_length - length($barcode)); 
+
+			if ( $mm < $best_barcode_mismatches_count ) {
+				$best_barcode_mismatches_count = $mm ;
+				$best_barcode_ident = $ident ;
+			}
+		}
+
+		$best_barcode_ident = 'unmatched' 
+			if ( (!defined $best_barcode_ident) || $best_barcode_mismatches_count>$allowed_mismatches) ;
+
+		print STDERR "sequence $seq_bases matched barcode: $best_barcode_ident\n" if $debug;
+
+		$counts{$best_barcode_ident}++;
+
+		#get the file associated with the matched barcode.
+		#(note: there's also a file associated with 'unmatched' barcode)
+		my $file = $files{$best_barcode_ident};
+
+		write_record($file);
+	}
+}
+
+#Quickly calculate hamming distance between two strings
+#
+#NOTE: Strings must be same length.
+#      returns number of different characters.
+#see  http://www.perlmonks.org/?node_id=500235
+sub mismatch_count($$) { length( $_[ 0 ] ) - ( ( $_[ 0 ] ^ $_[ 1 ] ) =~ tr[\0][\0] ) }
+
+
+
+sub print_results
+{
+	print "Barcode\tCount\tLocation\n";
+	my $total = 0 ;
+	foreach my $ident (sort keys %counts) {
+		print $ident, "\t", $counts{$ident},"\t",$filenames{$ident},"\n";
+		$total += $counts{$ident};
+	}
+	print "total\t",$total,"\n";
+}
+
+
+sub read_record
+{
+	$seq_name = $input_file_io->getline();
+
+	return undef unless defined $seq_name; # End of file?
+
+	$seq_bases = $input_file_io->getline();
+	die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases;
+
+	# If using FASTQ format, read two more lines
+	if ($fastq_format) {
+		$seq_name2  = $input_file_io->getline();
+		die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2;
+
+		$seq_qualities = $input_file_io->getline();
+		die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities;
+	}
+	return 1;
+}
+
+sub write_record($)
+{
+	my $file = shift;
+
+	croak "Bad file handle" unless defined $file;
+
+	print $file $seq_name;
+	print $file $seq_bases,"\n";
+
+	#if using FASTQ format, write two more lines
+	if ($fastq_format) {
+		print $file $seq_name2;
+		print $file $seq_qualities;
+	}
+}
+
+sub open_and_detect_input_format
+{
+	$input_file_io  = new IO::Handle;
+	die "Failed to open STDIN " unless $input_file_io->fdopen(fileno(STDIN),"r");
+
+	# Get the first characeter, and push it back
+	my $first_char = $input_file_io->getc();
+	$input_file_io->ungetc(ord $first_char);
+
+	if ($first_char eq '>') {
+		# FASTA format
+		$fastq_format = 0 ;
+		print STDERR "Detected FASTA format\n" if $debug;
+	} elsif ($first_char eq '@') {
+		# FASTQ format
+		$fastq_format = 1;
+		print STDERR "Detected FASTQ format\n" if $debug;
+	} else {
+		die "Error: unknown file format. First character = '$first_char' (expecting > or \@)\n";
+	}
+}
+
+sub usage()
+{
+print<<EOF;
+Barcode Splitter, by Assaf Gordon (gordon\@cshl.edu), 11sep2008
+
+This program reads FASTA/FASTQ file and splits it into several smaller files,
+Based on barcode matching.
+FASTA/FASTQ data is read from STDIN (format is auto-detected.)
+Output files will be writen to disk.
+Summary will be printed to STDOUT.
+
+usage: $0 --bcfile FILE --prefix PREFIX [--suffix SUFFIX] [--bol|--eol] 
+         [--mismatches N] [--exact] [--partial N] [--help] [--quiet] [--debug]
+
+Arguments:
+
+--bcfile FILE	- Barcodes file name. (see explanation below.)
+--prefix PREFIX	- File prefix. will be added to the output files. Can be used
+		  to specify output directories.
+--suffix SUFFIX	- File suffix (optional). Can be used to specify file
+		  extensions.
+--bol		- Try to match barcodes at the BEGINNING of sequences.
+		  (What biologists would call the 5' end, and programmers
+		  would call index 0.)
+--eol		- Try to match barcodes at the END of sequences.
+		  (What biologists would call the 3' end, and programmers
+		  would call the end of the string.)
+		  NOTE: one of --bol, --eol must be specified, but not both.
+--mismatches N	- Max. number of mismatches allowed. default is 1.
+--exact		- Same as '--mismatches 0'. If both --exact and --mismatches 
+		  are specified, '--exact' takes precedence.
+--partial N	- Allow partial overlap of barcodes. (see explanation below.)
+		  (Default is not partial matching)
+--quiet		- Don't print counts and summary at the end of the run.
+		  (Default is to print.)
+--debug		- Print lots of useless debug information to STDERR.
+--help		- This helpful help screen.
+
+Example (Assuming 's_2_100.txt' is a FASTQ file, 'mybarcodes.txt' is 
+the barcodes file):
+
+   \$ cat s_2_100.txt | $0 --bcfile mybarcodes.txt --bol --mismatches 2 \\
+   	--prefix /tmp/bla_ --suffix ".txt"
+
+Barcode file format
+-------------------
+Barcode files are simple text files. Each line should contain an identifier 
+(descriptive name for the barcode), and the barcode itself (A/C/G/T), 
+separated by a TAB character. Example:
+
+    #This line is a comment (starts with a 'number' sign)
+    BC1 GATCT
+    BC2 ATCGT
+    BC3 GTGAT
+    BC4 TGTCT
+
+For each barcode, a new FASTQ file will be created (with the barcode's 
+identifier as part of the file name). Sequences matching the barcode 
+will be stored in the appropriate file.
+
+Running the above example (assuming "mybarcodes.txt" contains the above 
+barcodes), will create the following files:
+	/tmp/bla_BC1.txt
+	/tmp/bla_BC2.txt
+	/tmp/bla_BC3.txt
+	/tmp/bla_BC4.txt
+	/tmp/bla_unmatched.txt
+The 'unmatched' file will contain all sequences that didn't match any barcode.
+
+Barcode matching
+----------------
+
+** Without partial matching:
+
+Count mismatches between the FASTA/Q sequences and the barcodes.
+The barcode which matched with the lowest mismatches count (providing the
+count is small or equal to '--mismatches N') 'gets' the sequences.
+
+Example (using the above barcodes):
+Input Sequence:
+    GATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
+
+Matching with '--bol --mismatches 1':
+   GATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
+   GATCT (1 mismatch, BC1)
+   ATCGT (4 mismatches, BC2)
+   GTGAT (3 mismatches, BC3)
+   TGTCT (3 mismatches, BC4)
+
+This sequence will be classified as 'BC1' (it has the lowest mismatch count).
+If '--exact' or '--mismatches 0' were specified, this sequence would be 
+classified as 'unmatched' (because, although BC1 had the lowest mismatch count,
+it is above the maximum allowed mismatches).
+
+Matching with '--eol' (end of line) does the same, but from the other side
+of the sequence.
+
+** With partial matching (very similar to indels):
+
+Same as above, with the following addition: barcodes are also checked for
+partial overlap (number of allowed non-overlapping bases is '--partial N').
+
+Example:
+Input sequence is ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
+(Same as above, but note the missing 'G' at the beginning.)
+
+Matching (without partial overlapping) against BC1 yields 4 mismatches:
+   ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
+   GATCT (4 mismatches)
+
+Partial overlapping would also try the following match:
+   -ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
+   GATCT (1 mismatch)
+
+Note: scoring counts a missing base as a mismatch, so the final
+mismatch count is 2 (1 'real' mismatch, 1 'missing base' mismatch).
+If running with '--mismatches 2' (meaning allowing upto 2 mismatches) - this 
+seqeunce will be classified as BC1.
+
+EOF
+
+exit 1;
+}
Binary file sff2fastq has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/trim.py	Mon Aug 29 05:44:57 2016 -0400
@@ -0,0 +1,59 @@
+import argparse
+
+#docs.python.org/dev/library/argparse.html
+parser = argparse.ArgumentParser()
+parser.add_argument("--input", help="Input fasta")
+parser.add_argument("--output", help="Output fasta")
+parser.add_argument("--start", help="How many nucleotides to trim from the start", type=int)
+parser.add_argument("--end", help="How many nucleotides to trim from the end", type=int)
+
+args = parser.parse_args()
+start = int(args.start)
+end = int(args.end)
+
+print args.input
+print args.output
+print start
+print end
+
+if end <= 0 and start <= 0:
+	import shutil
+	shutil.copy(args.input, args.output)
+	import sys
+	sys.exit()
+	
+
+
+currentSeq = ""
+currentId = ""
+
+if end is 0:
+	with open(args.input, 'r') as i:
+		with open(args.output, 'w') as o:
+			for line in i.readlines():
+				if line[0] is ">":
+					currentSeq = currentSeq[start:]
+					if currentSeq is not "" and currentId is not "":
+						o.write(currentId)
+						o.write(currentSeq + "\n")
+					currentId = line
+					currentSeq = ""
+				else:
+					currentSeq += line.rstrip()
+			o.write(currentId)
+			o.write(currentSeq[start:] + "\n")
+else:
+	with open(args.input, 'r') as i:
+		with open(args.output, 'w') as o:
+			for line in i.readlines():
+				if line[0] is ">":
+					currentSeq = currentSeq[start:-end]
+					if currentSeq is not "" and currentId is not "":
+						o.write(currentId)
+						o.write(currentSeq + "\n")
+					currentId = line
+					currentSeq = ""
+				else:
+					currentSeq += line.rstrip()
+			o.write(currentId)
+			o.write(currentSeq[start:-end] + "\n")
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/wrapper.sh	Mon Aug 29 05:44:57 2016 -0400
@@ -0,0 +1,67 @@
+#!/bin/bash
+input="$1"
+output="$2"
+outDir="$3"
+mkdir "$outDir"
+EOL="$4"
+mismatches="$5"
+partial="$6"
+name=$(basename "$7")
+ext="${name##*.}"
+name="${name%.*}"
+name="${name// /_}"
+prefix="${name}_"
+dir="$(cd "$(dirname "$0")" && pwd)"
+
+unzip $dir/fastqc_v0.11.2.zip -d $PWD/ > $PWD/unziplog.log
+chmod 755 $PWD/FastQC/fastqc
+
+declare -A trim_start
+declare -A trim_end
+for ((i=8;i<=$#;i=i+4))
+do
+	j=$((i+1))
+	start_int=$((i+2))
+	end_int=$((i+3))
+	id="${!i}"
+	echo "$id, ${start_int}, ${end_int}"
+	trim_start[$id]=${!start_int}
+	trim_end[$id]=${!end_int}
+  echo -e "$id\t${!j}" >> $outDir/barcodes.txt
+  
+done
+trim_start["unmatched"]=0
+trim_end["unmatched"]=0
+
+echo "trim_start = ${trim_start[@]}"
+echo "trim_end = ${trim_end[@]}"
+
+workdir=$PWD
+cd $outDir
+echo "$3"
+filetype=`file $input`
+result=""
+if [[ $filetype == *ASCII* ]]
+then
+	result=`cat $input | $dir/fastx_barcode_splitter.pl --bcfile $outDir/barcodes.txt --prefix "$prefix" --suffix ".fastq" --$EOL --mismatches $mismatches --partial $partial`
+else
+	result=`$dir/sff2fastq $input | $dir/fastx_barcode_splitter.pl --bcfile $outDir/barcodes.txt --prefix "$prefix" --suffix ".fastq" --$EOL --mismatches $mismatches --partial $partial`
+fi
+
+echo "$result" | tail -n +2 | sed 's/\t/,/g' > output.txt
+echo "<html><head><title>$name demultiplex</title></head><body><table border='1'><thead><tr><th>ID</th><th>Count</th><th>FASTQ</th><th>FASTA</th><th>Trimmed FASTA</th><th>FASTQC</th></tr></thead><tbody>" >> $output
+while IFS=, read barcode count location
+	do
+		if [ "total" == "$barcode" ] 
+		then 
+			echo "<tr><td>$barcode</td><td>$count</td><td></td><td></td><td></td><td></td><td></td><td></td></tr>" >> $output
+			break
+		fi
+		file="${name}_${barcode}"
+		mkdir "$outDir/fastqc_$barcode"
+		$workdir/FastQC/fastqc "$file.fastq" -o "$outDir" 2> /dev/null
+		cat "$file.fastq" | awk 'NR%4==1{printf ">%s\n", substr($0,2)}NR%4==2{print}' > "$file.fasta"
+		python $dir/trim.py --input "$file.fasta" --output "${file}_trimmed.fasta" --start "${trim_start[$barcode]}" --end "${trim_end[$barcode]}"
+		echo "<tr><td>$barcode</td><td>$count</td><td><a href='$file.fastq'>$file.fastq</a></td><td><a href='$file.fasta'>$file.fasta</a></td><td><a href='${file}_trimmed.fasta'>${file}_trimmed.fasta</a></td><td><a href='${name}_${barcode}_fastqc.html'>Report</a></td></tr>" >> $output
+done < output.txt
+echo "</tbody></body></html>" >> $output