Mercurial > repos > davidvanzessen > sff_extract_demultiplex
changeset 0:cb08a27e5fc2 draft
Uploaded
author | davidvanzessen |
---|---|
date | Mon, 29 Aug 2016 05:44:57 -0400 |
parents | |
children | cbce7f35f8b0 |
files | demultiplex.xml fastqc_v0.11.2.zip fastx_barcode_splitter.pl sff2fastq trim.py wrapper.sh |
diffstat | 6 files changed, 947 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/demultiplex.xml Mon Aug 29 05:44:57 2016 -0400 @@ -0,0 +1,349 @@ +<tool id="demultiplex" name="Demultiplex" version="1.0"> + <description> </description> + <requirements> + <requirement type="package" version="0.0.13">fastx_toolkit</requirement> + </requirements> + <command interpreter="bash"> + wrapper.sh "$input" "$out_file" "$out_file.files_path" "$where" "$mismatches" "$partial" "$input.name" + #for $i, $b in enumerate($barcodes) + "$b.id" + "$b.mid" + "$b.trim_start" + "$b.trim_end" + #end for + </command> + <inputs> + <param name="input" type="data" label="File to split" /> + <repeat name="barcodes" title="Barcodes" min="1" default="1"> + <param name="id" type="text" size="50" label="ID" /> + <param name="mid" type="select" label="Mid"> + <option value="ACGAGTGCGT">MID-1</option> + <option value="ACGCACTCGT">MID-1 reverse complement</option> + <option value="ACGCTCGACA">MID-2</option> + <option value="TGTCGAGCGT">MID-2 reverse complement</option> + <option value="AGACGCACTC">MID-3</option> + <option value="GAGTGCGTCT">MID-3 reverse complement</option> + <option value="AGCACTGTAG">MID-4</option> + <option value="CTACAGTGCT">MID-4 reverse complement</option> + <option value="ATCAGACACG">MID-5</option> + <option value="CGTGTCTGAT">MID-5 reverse complement</option> + <option value="ATATCGCGAG">MID-6</option> + <option value="CTCGCGATAT">MID-6 reverse complement</option> + <option value="CGTGTCTCTA">MID-7</option> + <option value="TAGAGACACG">MID-7 reverse complement</option> + <option value="CTCGCGTGTC">MID-8</option> + <option value="GACACGCGAG">MID-8 reverse complement</option> + <option value="TCTCTATGCG">MID-10</option> + <option value="CGCATAGAGA">MID-10 reverse complement</option> + <option value="TGATACGTCT">MID-11</option> + <option value="AGACGTATCA">MID-11 reverse complement</option> + <option value="CATAGTAGTG">MID-13</option> + <option value="CACTACTATG">MID-13 reverse complement</option> + <option value="CGAGAGATAC">MID-14</option> + <option value="GTATCTCTCG">MID-14 reverse complement</option> + <option value="ATACGACGTA">MID-15</option> + <option value="TACGTCGTAT">MID-15 reverse complement</option> + <option value="TCACGTACTA">MID-16</option> + <option value="TAGTACGTGA">MID-16 reverse complement</option> + <option value="CGTCTAGTAC">MID-17</option> + <option value="GTACTAGACG">MID-17 reverse complement</option> + <option value="TCTACGTAGC">MID-18</option> + <option value="GCTACGTAGA">MID-18 reverse complement</option> + <option value="TGTACTACTC">MID-19</option> + <option value="GAGTAGTACA">MID-19 reverse complement</option> + <option value="ACGACTACAG">MID-20</option> + <option value="CTGTAGTCGT">MID-20 reverse complement</option> + <option value="CGTAGACTAG">MID-21</option> + <option value="CTAGTCTACG">MID-21 reverse complement</option> + <option value="TACGAGTATG">MID-22</option> + <option value="CATACTCGTA">MID-22 reverse complement</option> + <option value="TACTCTCGTG">MID-23</option> + <option value="CACGAGAGTA">MID-23 reverse complement</option> + <option value="TAGAGACGAG">MID-24</option> + <option value="CTCGTCTCTA">MID-24 reverse complement</option> + <option value="TCGTCGCTCG">MID-25</option> + <option value="CGAGCGACGA">MID-25 reverse complement</option> + <option value="ACATACGCGT">MID-26</option> + <option value="ACGCGTATGT">MID-26 reverse complement</option> + <option value="ACGCGAGTAT">MID-27</option> + <option value="ATACTCGCGT">MID-27 reverse complement</option> + <option value="ACTACTATGT">MID-28</option> + <option value="ACATAGTAGT">MID-28 reverse complement</option> + <option value="ACTGTACAGT">MID-29</option> + <option value="ACTGTACAGT">MID-29 reverse complement</option> + <option value="AGACTATACT">MID-30</option> + <option value="AGTATAGTCT">MID-30 reverse complement</option> + <option value="AGCGTCGTCT">MID-31</option> + <option value="AGACGACGCT">MID-31 reverse complement</option> + <option value="AGTACGCTAT">MID-32</option> + <option value="ATAGCGTACT">MID-32 reverse complement</option> + <option value="ATAGAGTACT">MID-33</option> + <option value="AGTACTCTAT">MID-33 reverse complement</option> + <option value="CACGCTACGT">MID-34</option> + <option value="ACGTAGCGTG">MID-34 reverse complement</option> + <option value="CAGTAGACGT">MID-35</option> + <option value="ACGTCTACTG">MID-35 reverse complement</option> + <option value="CGACGTGACT">MID-36</option> + <option value="AGTCACGTCG">MID-36 reverse complement</option> + <option value="TACACACACT">MID-37</option> + <option value="AGTGTGTGTA">MID-37 reverse complement</option> + <option value="TACACGTGAT">MID-38</option> + <option value="ATCACGTGTA">MID-38 reverse complement</option> + <option value="TACAGATCGT">MID-39</option> + <option value="ACGATCTGTA">MID-39 reverse complement</option> + <option value="TACGCTGTCT">MID-40</option> + <option value="AGACAGCGTA">MID-40 reverse complement</option> + <option value="TAGTGTAGAT">MID-41</option> + <option value="ATCTACACTA">MID-41 reverse complement</option> + <option value="TCGATCACGT">MID-42</option> + <option value="ACGTGATCGA">MID-42 reverse complement</option> + <option value="TCGCACTAGT">MID-43</option> + <option value="ACTAGTGCGA">MID-43 reverse complement</option> + <option value="TCTAGCGACT">MID-44</option> + <option value="AGTCGCTAGA">MID-44 reverse complement</option> + <option value="TCTATACTAT">MID-45</option> + <option value="ATAGTATAGA">MID-45 reverse complement</option> + <option value="TGACGTATGT">MID-46</option> + <option value="ACATACGTCA">MID-46 reverse complement</option> + <option value="TGTGAGTAGT">MID-47</option> + <option value="ACTACTCACA">MID-47 reverse complement</option> + <option value="ACAGTATATA">MID-48</option> + <option value="TATATACTGT">MID-48 reverse complement</option> + <option value="ACGCGATCGA">MID-49</option> + <option value="TCGATCGCGT">MID-49 reverse complement</option> + <option value="ACTAGCAGTA">MID-50</option> + <option value="TACTGCTAGT">MID-50 reverse complement</option> + <option value="AGCTCACGTA">MID-51</option> + <option value="TACGTGAGCT">MID-51 reverse complement</option> + <option value="AGTATACATA">MID-52</option> + <option value="TATGTATACT">MID-52 reverse complement</option> + <option value="AGTCGAGAGA">MID-53</option> + <option value="TCTCTCGACT">MID-53 reverse complement</option> + <option value="AGTGCTACGA">MID-54</option> + <option value="TCGTAGCACT">MID-54 reverse complement</option> + <option value="CGATCGTATA">MID-55</option> + <option value="TATACGATCG">MID-55 reverse complement</option> + <option value="CGCAGTACGA">MID-56</option> + <option value="TCGTACTGCG">MID-56 reverse complement</option> + <option value="CGCGTATACA">MID-57</option> + <option value="TGTATACGCG">MID-57 reverse complement</option> + <option value="CGTACAGTCA">MID-58</option> + <option value="TGACTGTACG">MID-58 reverse complement</option> + <option value="CGTACTCAGA">MID-59</option> + <option value="TCTGAGTACG">MID-59 reverse complement</option> + <option value="CTACGCTCTA">MID-60</option> + <option value="TAGAGCGTAG">MID-60 reverse complement</option> + <option value="CTATAGCGTA">MID-61</option> + <option value="TACGCTATAG">MID-61 reverse complement</option> + <option value="TACGTCATCA">MID-62</option> + <option value="TGATGACGTA">MID-62 reverse complement</option> + <option value="TAGTCGCATA">MID-63</option> + <option value="TATGCGACTA">MID-63 reverse complement</option> + <option value="TATATATACA">MID-64</option> + <option value="TGTATATATA">MID-64 reverse complement</option> + <option value="TATGCTAGTA">MID-65</option> + <option value="TACTAGCATA">MID-65 reverse complement</option> + <option value="TCACGCGAGA">MID-66</option> + <option value="TCTCGCGTGA">MID-66 reverse complement</option> + <option value="TCGATAGTGA">MID-67</option> + <option value="TCACTATCGA">MID-67 reverse complement</option> + <option value="TCGCTGCGTA">MID-68</option> + <option value="TACGCAGCGA">MID-68 reverse complement</option> + <option value="TCTGACGTCA">MID-69</option> + <option value="TGACGTCAGA">MID-69 reverse complement</option> + <option value="TGAGTCAGTA">MID-70</option> + <option value="TACTGACTCA">MID-70 reverse complement</option> + <option value="TGTAGTGTGA">MID-71</option> + <option value="TCACACTACA">MID-71 reverse complement</option> + <option value="TGTCACACGA">MID-72</option> + <option value="TCGTGTGACA">MID-72 reverse complement</option> + <option value="TGTCGTCGCA">MID-73</option> + <option value="TGCGACGACA">MID-73 reverse complement</option> + <option value="ACACATACGC">MID-74</option> + <option value="GCGTATGTGT">MID-74 reverse complement</option> + <option value="ACAGTCGTGC">MID-75</option> + <option value="GCACGACTGT">MID-75 reverse complement</option> + <option value="ACATGACGAC">MID-76</option> + <option value="GTCGTCATGT">MID-76 reverse complement</option> + <option value="ACGACAGCTC">MID-77</option> + <option value="GAGCTGTCGT">MID-77 reverse complement</option> + <option value="ACGTCTCATC">MID-78</option> + <option value="GATGAGACGT">MID-78 reverse complement</option> + <option value="ACTCATCTAC">MID-79</option> + <option value="GTAGATGAGT">MID-79 reverse complement</option> + <option value="ACTCGCGCAC">MID-80</option> + <option value="GTGCGCGAGT">MID-80 reverse complement</option> + <option value="AGAGCGTCAC">MID-81</option> + <option value="GTGACGCTCT">MID-81 reverse complement</option> + <option value="AGCGACTAGC">MID-82</option> + <option value="GCTAGTCGCT">MID-82 reverse complement</option> + <option value="AGTAGTGATC">MID-83</option> + <option value="GATCACTACT">MID-83 reverse complement</option> + <option value="AGTGACACAC">MID-84</option> + <option value="GTGTGTCACT">MID-84 reverse complement</option> + <option value="AGTGTATGTC">MID-85</option> + <option value="GACATACACT">MID-85 reverse complement</option> + <option value="ATAGATAGAC">MID-86</option> + <option value="GTCTATCTAT">MID-86 reverse complement</option> + <option value="ATATAGTCGC">MID-87</option> + <option value="GCGACTATAT">MID-87 reverse complement</option> + <option value="ATCTACTGAC">MID-88</option> + <option value="GTCAGTAGAT">MID-88 reverse complement</option> + <option value="CACGTAGATC">MID-89</option> + <option value="GATCTACGTG">MID-89 reverse complement</option> + <option value="CACGTGTCGC">MID-90</option> + <option value="GCGACACGTG">MID-90 reverse complement</option> + <option value="CATACTCTAC">MID-91</option> + <option value="GTAGAGTATG">MID-91 reverse complement</option> + <option value="CGACACTATC">MID-92</option> + <option value="GATAGTGTCG">MID-92 reverse complement</option> + <option value="CGAGACGCGC">MID-93</option> + <option value="GCGCGTCTCG">MID-93 reverse complement</option> + <option value="CGTATGCGAC">MID-94</option> + <option value="GTCGCATACG">MID-94 reverse complement</option> + <option value="CGTCGATCTC">MID-95</option> + <option value="GAGATCGACG">MID-95 reverse complement</option> + <option value="CTACGACTGC">MID-96</option> + <option value="GCAGTCGTAG">MID-96 reverse complement</option> + <option value="CTAGTCACTC">MID-97</option> + <option value="GAGTGACTAG">MID-97 reverse complement</option> + <option value="CTCTACGCTC">MID-98</option> + <option value="GAGCGTAGAG">MID-98 reverse complement</option> + <option value="CTGTACATAC">MID-99</option> + <option value="GTATGTACAG">MID-99 reverse complement</option> + <option value="TAGACTGCAC">MID-100</option> + <option value="GTGCAGTCTA">MID-100 reverse complement</option> + <option value="TAGCGCGCGC">MID-101</option> + <option value="GCGCGCGCTA">MID-101 reverse complement</option> + <option value="TAGCTCTATC">MID-102</option> + <option value="GATAGAGCTA">MID-102 reverse complement</option> + <option value="TATAGACATC">MID-103</option> + <option value="GATGTCTATA">MID-103 reverse complement</option> + <option value="TATGATACGC">MID-104</option> + <option value="GCGTATCATA">MID-104 reverse complement</option> + <option value="TCACTCATAC">MID-105</option> + <option value="GTATGAGTGA">MID-105 reverse complement</option> + <option value="TCATCGAGTC">MID-106</option> + <option value="GACTCGATGA">MID-106 reverse complement</option> + <option value="TCGAGCTCTC">MID-107</option> + <option value="GAGAGCTCGA">MID-107 reverse complement</option> + <option value="TCGCAGACAC">MID-108</option> + <option value="GTGTCTGCGA">MID-108 reverse complement</option> + <option value="TCTGTCTCGC">MID-109</option> + <option value="GCGAGACAGA">MID-109 reverse complement</option> + <option value="TGAGTGACGC">MID-110</option> + <option value="GCGTCACTCA">MID-110 reverse complement</option> + <option value="TGATGTGTAC">MID-111</option> + <option value="GTACACATCA">MID-111 reverse complement</option> + <option value="TGCTATAGAC">MID-112</option> + <option value="GTCTATAGCA">MID-112 reverse complement</option> + <option value="TGCTCGCTAC">MID-113</option> + <option value="GTAGCGAGCA">MID-113 reverse complement</option> + <option value="ACGTGCAGCG">MID-114</option> + <option value="CGCTGCACGT">MID-114 reverse complement</option> + <option value="ACTCACAGAG">MID-115</option> + <option value="CTCTGTGAGT">MID-115 reverse complement</option> + <option value="AGACTCAGCG">MID-116</option> + <option value="CGCTGAGTCT">MID-116 reverse complement</option> + <option value="AGAGAGTGTG">MID-117</option> + <option value="CACACTCTCT">MID-117 reverse complement</option> + <option value="AGCTATCGCG">MID-118</option> + <option value="CGCGATAGCT">MID-118 reverse complement</option> + <option value="AGTCTGACTG">MID-119</option> + <option value="CAGTCAGACT">MID-119 reverse complement</option> + <option value="AGTGAGCTCG">MID-120</option> + <option value="CGAGCTCACT">MID-120 reverse complement</option> + <option value="ATAGCTCTCG">MID-121</option> + <option value="CGAGAGCTAT">MID-121 reverse complement</option> + <option value="ATCACGTGCG">MID-122</option> + <option value="CGCACGTGAT">MID-122 reverse complement</option> + <option value="ATCGTAGCAG">MID-123</option> + <option value="CTGCTACGAT">MID-123 reverse complement</option> + <option value="ATCGTCTGTG">MID-124</option> + <option value="CACAGACGAT">MID-124 reverse complement</option> + <option value="ATGTACGATG">MID-125</option> + <option value="CATCGTACAT">MID-125 reverse complement</option> + <option value="ATGTGTCTAG">MID-126</option> + <option value="CTAGACACAT">MID-126 reverse complement</option> + <option value="CACACGATAG">MID-127</option> + <option value="CTATCGTGTG">MID-127 reverse complement</option> + <option value="CACTCGCACG">MID-128</option> + <option value="CGTGCGAGTG">MID-128 reverse complement</option> + <option value="CAGACGTCTG">MID-129</option> + <option value="CAGACGTCTG">MID-129 reverse complement</option> + <option value="CAGTACTGCG">MID-130</option> + <option value="CGCAGTACTG">MID-130 reverse complement</option> + <option value="CGACAGCGAG">MID-131</option> + <option value="CTCGCTGTCG">MID-131 reverse complement</option> + <option value="CGATCTGTCG">MID-132</option> + <option value="CGACAGATCG">MID-132 reverse complement</option> + <option value="CGCGTGCTAG">MID-133</option> + <option value="CTAGCACGCG">MID-133 reverse complement</option> + <option value="CGCTCGAGTG">MID-134</option> + <option value="CACTCGAGCG">MID-134 reverse complement</option> + <option value="CGTGATGACG">MID-135</option> + <option value="CGTCATCACG">MID-135 reverse complement</option> + <option value="CTATGTACAG">MID-136</option> + <option value="CTGTACATAG">MID-136 reverse complement</option> + <option value="CTCGATATAG">MID-137</option> + <option value="CTATATCGAG">MID-137 reverse complement</option> + <option value="CTCGCACGCG">MID-138</option> + <option value="CGCGTGCGAG">MID-138 reverse complement</option> + <option value="CTGCGTCACG">MID-139</option> + <option value="CGTGACGCAG">MID-139 reverse complement</option> + <option value="CTGTGCGTCG">MID-140</option> + <option value="CGACGCACAG">MID-140 reverse complement</option> + <option value="TAGCATACTG">MID-141</option> + <option value="CAGTATGCTA">MID-141 reverse complement</option> + <option value="TATACATGTG">MID-142</option> + <option value="CACATGTATA">MID-142 reverse complement</option> + <option value="TATCACTCAG">MID-143</option> + <option value="CTGAGTGATA">MID-143 reverse complement</option> + <option value="TATCTGATAG">MID-144</option> + <option value="CTATCAGATA">MID-144 reverse complement</option> + <option value="TCGTGACATG">MID-145</option> + <option value="CATGTCACGA">MID-145 reverse complement</option> + <option value="TCTGATCGAG">MID-146</option> + <option value="CTCGATCAGA">MID-146 reverse complement</option> + <option value="TGACATCTCG">MID-147</option> + <option value="CGAGATGTCA">MID-147 reverse complement</option> + <option value="TGAGCTAGAG">MID-148</option> + <option value="CTCTAGCTCA">MID-148 reverse complement</option> + <option value="TGATAGAGCG">MID-149</option> + <option value="CGCTCTATCA">MID-149 reverse complement</option> + <option value="TGCGTGTGCG">MID-150</option> + <option value="CGCACACGCA">MID-150 reverse complement</option> + <option value="TGCTAGTCAG">MID-151</option> + <option value="CTGACTAGCA">MID-151 reverse complement</option> + <option value="TGTATCACAG">MID-152</option> + <option value="CTGTGATACA">MID-152 reverse complement</option> + <option value="TGTGCGCGTG">MID-153</option> + <option value="CACGCGCACA">MID-153 reverse complement</option> + </param> + + <param name="trim_start" type="integer" size="3" value="0" label="How many nucleotides to trim from the start" /> + + <param name="trim_end" type="integer" size="3" value="0" label="How many nucleotides to trim from the end" /> + </repeat> + + <param name="where" type="select" label="Barcodes found at"> + <option value="bol">Start: 5' end</option> + <option value="eol">End: 3' end</option> + </param> + + <param name="mismatches" type="integer" size="3" value="2" label="Max. number of mismatches allowed." /> + + <param name="partial" type="integer" size="3" value="0" label="Allow partial overlap of barcodes." /> + + </inputs> + <outputs> + <data format="html" name="out_file" /> + </outputs> + <help> +- Splitting sff or fastq files into FASTQ, FASTA and (optional) trimmed FASTA files with a FASTQC report on the FASTQ file, this tool uses: +- sff2fastq (https://github.com/indraniel/sff2fastq) to extract a fastq file. +- fastx_barcode_splitter.pl (http://hannonlab.cshl.edu/fastx_toolkit/commandline.html) to demultiplex. +- fastqc (http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) to provide analysis of the fastq files. + + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastx_barcode_splitter.pl Mon Aug 29 05:44:57 2016 -0400 @@ -0,0 +1,472 @@ +#!/usr/bin/perl + +# FASTX-toolkit - FASTA/FASTQ preprocessing tools. +# Copyright (C) 2009-2013 A. Gordon (assafgordon@gmail.com) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +use strict; +use warnings; +use IO::Handle; +use Data::Dumper; +use Getopt::Long; +use Carp; + +## +## This program splits a FASTQ/FASTA file into several smaller files, +## Based on barcode matching. +## +## run with "--help" for usage information +## +## Assaf Gordon <assafgordon@gmail.com> , 11sep2008 + +# Forward declarations +sub load_barcode_file ($); +sub parse_command_line ; +sub match_sequences ; +sub mismatch_count($$) ; +sub print_results; +sub open_and_detect_input_format; +sub read_record; +sub write_record($); +sub usage(); + +# Global flags and arguments, +# Set by command line argumens +my $barcode_file ; +my $barcodes_at_eol = 0 ; +my $barcodes_at_bol = 0 ; +my $exact_match = 0 ; +my $allow_partial_overlap = 0; +my $allowed_mismatches = 1; +my $newfile_suffix = ''; +my $newfile_prefix ; +my $quiet = 0 ; +my $debug = 0 ; +my $fastq_format = 1; + +# Global variables +# Populated by 'create_output_files' +my %filenames; +my %files; +my %counts = ( 'unmatched' => 0 ); +my $barcodes_length; +my @barcodes; +my $input_file_io; + + +# The Four lines per record in FASTQ format. +# (when using FASTA format, only the first two are used) +my $seq_name; +my $seq_bases; +my $seq_name2; +my $seq_qualities; + + +# +# Start of Program +# +parse_command_line ; + +load_barcode_file ( $barcode_file ) ; + +open_and_detect_input_format; + +match_sequences ; + +print_results unless $quiet; + +# +# End of program +# + + + + + + + + +sub parse_command_line { + my $help; + + usage() if (scalar @ARGV==0); + + my $result = GetOptions ( "bcfile=s" => \$barcode_file, + "eol" => \$barcodes_at_eol, + "bol" => \$barcodes_at_bol, + "exact" => \$exact_match, + "prefix=s" => \$newfile_prefix, + "suffix=s" => \$newfile_suffix, + "quiet" => \$quiet, + "partial=i" => \$allow_partial_overlap, + "debug" => \$debug, + "mismatches=i" => \$allowed_mismatches, + "help" => \$help + ) ; + + usage() if ($help); + + die "Error: barcode file not specified (use '--bcfile [FILENAME]')\n" unless defined $barcode_file; + die "Error: prefix path/filename not specified (use '--prefix [PATH]')\n" unless defined $newfile_prefix; + + if ($barcodes_at_bol == $barcodes_at_eol) { + die "Error: can't specify both --eol & --bol\n" if $barcodes_at_eol; + die "Error: must specify either --eol or --bol\n" ; + } + + die "Error: invalid for value partial matches (valid values are 0 or greater)\n" if $allow_partial_overlap<0; + + $allowed_mismatches = 0 if $exact_match; + + die "Error: invalid value for mismatches (valid values are 0 or more)\n" if ($allowed_mismatches<0); + + die "Error: partial overlap value ($allow_partial_overlap) bigger than " . + "max. allowed mismatches ($allowed_mismatches)\n" if ($allow_partial_overlap > $allowed_mismatches); + + + exit unless $result; +} + + + +# +# Read the barcode file +# +sub load_barcode_file ($) { + my $filename = shift or croak "Missing barcode file name"; + + open BCFILE,"<$filename" or die "Error: failed to open barcode file ($filename)\n"; + while (<BCFILE>) { + next if m/^#/; + chomp; + my ($ident, $barcode) = split ; + + $barcode = uc($barcode); + + # Sanity checks on the barcodes + die "Error: bad data at barcode file ($filename) line $.\n" unless defined $barcode; + die "Error: bad barcode value ($barcode) at barcode file ($filename) line $.\n" + unless $barcode =~ m/^[AGCT]+$/; + + die "Error: bad identifier value ($ident) at barcode file ($filename) line $. (must be alphanumeric)\n" + unless $ident =~ m/^\w+$/; + + die "Error: badcode($ident, $barcode) is shorter or equal to maximum number of " . + "mismatches ($allowed_mismatches). This makes no sense. Specify fewer mismatches.\n" + if length($barcode)<=$allowed_mismatches; + + $barcodes_length = length($barcode) unless defined $barcodes_length; + die "Error: found barcodes in different lengths. this feature is not supported yet.\n" + unless $barcodes_length == length($barcode); + + push @barcodes, [$ident, $barcode]; + + if ($allow_partial_overlap>0) { + foreach my $i (1 .. $allow_partial_overlap) { + substr $barcode, ($barcodes_at_bol)?0:-1, 1, ''; + push @barcodes, [$ident, $barcode]; + } + } + } + close BCFILE; + + if ($debug) { + print STDERR "barcode\tsequence\n"; + foreach my $barcoderef (@barcodes) { + my ($ident, $seq) = @{$barcoderef}; + print STDERR $ident,"\t", $seq ,"\n"; + } + } +} + +# Create one output file for each barcode. +# (Also create a file for the dummy 'unmatched' barcode) +sub create_output_files { + my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers; + $barcodes{'unmatched'} = 1 ; + + foreach my $ident (keys %barcodes) { + my $new_filename = $newfile_prefix . $ident . $newfile_suffix; + $filenames{$ident} = $new_filename; + open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n"; + $files{$ident} = $file ; + } +} + +sub match_sequences { + + my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers; + $barcodes{'unmatched'} = 1 ; + + #reset counters + foreach my $ident ( keys %barcodes ) { + $counts{$ident} = 0; + } + + create_output_files; + + # Read file FASTQ file + # split accotding to barcodes + while ( read_record ) { + chomp $seq_bases; + + print STDERR "sequence $seq_bases: \n" if $debug; + + my $best_barcode_mismatches_count = $barcodes_length; + my $best_barcode_ident = undef; + + #Try all barcodes, find the one with the lowest mismatch count + foreach my $barcoderef (@barcodes) { + my ($ident, $barcode) = @{$barcoderef}; + + # Get DNA fragment (in the length of the barcodes) + # The barcode will be tested only against this fragment + # (no point in testing the barcode against the whole sequence) + my $sequence_fragment; + if ($barcodes_at_bol) { + $sequence_fragment = substr $seq_bases, 0, $barcodes_length; + } else { + $sequence_fragment = substr $seq_bases, - $barcodes_length; + } + + my $mm = mismatch_count($sequence_fragment, $barcode) ; + + # if this is a partial match, add the non-overlap as a mismatch + # (partial barcodes are shorter than the length of the original barcodes) + $mm += ($barcodes_length - length($barcode)); + + if ( $mm < $best_barcode_mismatches_count ) { + $best_barcode_mismatches_count = $mm ; + $best_barcode_ident = $ident ; + } + } + + $best_barcode_ident = 'unmatched' + if ( (!defined $best_barcode_ident) || $best_barcode_mismatches_count>$allowed_mismatches) ; + + print STDERR "sequence $seq_bases matched barcode: $best_barcode_ident\n" if $debug; + + $counts{$best_barcode_ident}++; + + #get the file associated with the matched barcode. + #(note: there's also a file associated with 'unmatched' barcode) + my $file = $files{$best_barcode_ident}; + + write_record($file); + } +} + +#Quickly calculate hamming distance between two strings +# +#NOTE: Strings must be same length. +# returns number of different characters. +#see http://www.perlmonks.org/?node_id=500235 +sub mismatch_count($$) { length( $_[ 0 ] ) - ( ( $_[ 0 ] ^ $_[ 1 ] ) =~ tr[\0][\0] ) } + + + +sub print_results +{ + print "Barcode\tCount\tLocation\n"; + my $total = 0 ; + foreach my $ident (sort keys %counts) { + print $ident, "\t", $counts{$ident},"\t",$filenames{$ident},"\n"; + $total += $counts{$ident}; + } + print "total\t",$total,"\n"; +} + + +sub read_record +{ + $seq_name = $input_file_io->getline(); + + return undef unless defined $seq_name; # End of file? + + $seq_bases = $input_file_io->getline(); + die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases; + + # If using FASTQ format, read two more lines + if ($fastq_format) { + $seq_name2 = $input_file_io->getline(); + die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2; + + $seq_qualities = $input_file_io->getline(); + die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities; + } + return 1; +} + +sub write_record($) +{ + my $file = shift; + + croak "Bad file handle" unless defined $file; + + print $file $seq_name; + print $file $seq_bases,"\n"; + + #if using FASTQ format, write two more lines + if ($fastq_format) { + print $file $seq_name2; + print $file $seq_qualities; + } +} + +sub open_and_detect_input_format +{ + $input_file_io = new IO::Handle; + die "Failed to open STDIN " unless $input_file_io->fdopen(fileno(STDIN),"r"); + + # Get the first characeter, and push it back + my $first_char = $input_file_io->getc(); + $input_file_io->ungetc(ord $first_char); + + if ($first_char eq '>') { + # FASTA format + $fastq_format = 0 ; + print STDERR "Detected FASTA format\n" if $debug; + } elsif ($first_char eq '@') { + # FASTQ format + $fastq_format = 1; + print STDERR "Detected FASTQ format\n" if $debug; + } else { + die "Error: unknown file format. First character = '$first_char' (expecting > or \@)\n"; + } +} + +sub usage() +{ +print<<EOF; +Barcode Splitter, by Assaf Gordon (gordon\@cshl.edu), 11sep2008 + +This program reads FASTA/FASTQ file and splits it into several smaller files, +Based on barcode matching. +FASTA/FASTQ data is read from STDIN (format is auto-detected.) +Output files will be writen to disk. +Summary will be printed to STDOUT. + +usage: $0 --bcfile FILE --prefix PREFIX [--suffix SUFFIX] [--bol|--eol] + [--mismatches N] [--exact] [--partial N] [--help] [--quiet] [--debug] + +Arguments: + +--bcfile FILE - Barcodes file name. (see explanation below.) +--prefix PREFIX - File prefix. will be added to the output files. Can be used + to specify output directories. +--suffix SUFFIX - File suffix (optional). Can be used to specify file + extensions. +--bol - Try to match barcodes at the BEGINNING of sequences. + (What biologists would call the 5' end, and programmers + would call index 0.) +--eol - Try to match barcodes at the END of sequences. + (What biologists would call the 3' end, and programmers + would call the end of the string.) + NOTE: one of --bol, --eol must be specified, but not both. +--mismatches N - Max. number of mismatches allowed. default is 1. +--exact - Same as '--mismatches 0'. If both --exact and --mismatches + are specified, '--exact' takes precedence. +--partial N - Allow partial overlap of barcodes. (see explanation below.) + (Default is not partial matching) +--quiet - Don't print counts and summary at the end of the run. + (Default is to print.) +--debug - Print lots of useless debug information to STDERR. +--help - This helpful help screen. + +Example (Assuming 's_2_100.txt' is a FASTQ file, 'mybarcodes.txt' is +the barcodes file): + + \$ cat s_2_100.txt | $0 --bcfile mybarcodes.txt --bol --mismatches 2 \\ + --prefix /tmp/bla_ --suffix ".txt" + +Barcode file format +------------------- +Barcode files are simple text files. Each line should contain an identifier +(descriptive name for the barcode), and the barcode itself (A/C/G/T), +separated by a TAB character. Example: + + #This line is a comment (starts with a 'number' sign) + BC1 GATCT + BC2 ATCGT + BC3 GTGAT + BC4 TGTCT + +For each barcode, a new FASTQ file will be created (with the barcode's +identifier as part of the file name). Sequences matching the barcode +will be stored in the appropriate file. + +Running the above example (assuming "mybarcodes.txt" contains the above +barcodes), will create the following files: + /tmp/bla_BC1.txt + /tmp/bla_BC2.txt + /tmp/bla_BC3.txt + /tmp/bla_BC4.txt + /tmp/bla_unmatched.txt +The 'unmatched' file will contain all sequences that didn't match any barcode. + +Barcode matching +---------------- + +** Without partial matching: + +Count mismatches between the FASTA/Q sequences and the barcodes. +The barcode which matched with the lowest mismatches count (providing the +count is small or equal to '--mismatches N') 'gets' the sequences. + +Example (using the above barcodes): +Input Sequence: + GATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG + +Matching with '--bol --mismatches 1': + GATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG + GATCT (1 mismatch, BC1) + ATCGT (4 mismatches, BC2) + GTGAT (3 mismatches, BC3) + TGTCT (3 mismatches, BC4) + +This sequence will be classified as 'BC1' (it has the lowest mismatch count). +If '--exact' or '--mismatches 0' were specified, this sequence would be +classified as 'unmatched' (because, although BC1 had the lowest mismatch count, +it is above the maximum allowed mismatches). + +Matching with '--eol' (end of line) does the same, but from the other side +of the sequence. + +** With partial matching (very similar to indels): + +Same as above, with the following addition: barcodes are also checked for +partial overlap (number of allowed non-overlapping bases is '--partial N'). + +Example: +Input sequence is ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG +(Same as above, but note the missing 'G' at the beginning.) + +Matching (without partial overlapping) against BC1 yields 4 mismatches: + ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG + GATCT (4 mismatches) + +Partial overlapping would also try the following match: + -ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG + GATCT (1 mismatch) + +Note: scoring counts a missing base as a mismatch, so the final +mismatch count is 2 (1 'real' mismatch, 1 'missing base' mismatch). +If running with '--mismatches 2' (meaning allowing upto 2 mismatches) - this +seqeunce will be classified as BC1. + +EOF + +exit 1; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/trim.py Mon Aug 29 05:44:57 2016 -0400 @@ -0,0 +1,59 @@ +import argparse + +#docs.python.org/dev/library/argparse.html +parser = argparse.ArgumentParser() +parser.add_argument("--input", help="Input fasta") +parser.add_argument("--output", help="Output fasta") +parser.add_argument("--start", help="How many nucleotides to trim from the start", type=int) +parser.add_argument("--end", help="How many nucleotides to trim from the end", type=int) + +args = parser.parse_args() +start = int(args.start) +end = int(args.end) + +print args.input +print args.output +print start +print end + +if end <= 0 and start <= 0: + import shutil + shutil.copy(args.input, args.output) + import sys + sys.exit() + + + +currentSeq = "" +currentId = "" + +if end is 0: + with open(args.input, 'r') as i: + with open(args.output, 'w') as o: + for line in i.readlines(): + if line[0] is ">": + currentSeq = currentSeq[start:] + if currentSeq is not "" and currentId is not "": + o.write(currentId) + o.write(currentSeq + "\n") + currentId = line + currentSeq = "" + else: + currentSeq += line.rstrip() + o.write(currentId) + o.write(currentSeq[start:] + "\n") +else: + with open(args.input, 'r') as i: + with open(args.output, 'w') as o: + for line in i.readlines(): + if line[0] is ">": + currentSeq = currentSeq[start:-end] + if currentSeq is not "" and currentId is not "": + o.write(currentId) + o.write(currentSeq + "\n") + currentId = line + currentSeq = "" + else: + currentSeq += line.rstrip() + o.write(currentId) + o.write(currentSeq[start:-end] + "\n")
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/wrapper.sh Mon Aug 29 05:44:57 2016 -0400 @@ -0,0 +1,67 @@ +#!/bin/bash +input="$1" +output="$2" +outDir="$3" +mkdir "$outDir" +EOL="$4" +mismatches="$5" +partial="$6" +name=$(basename "$7") +ext="${name##*.}" +name="${name%.*}" +name="${name// /_}" +prefix="${name}_" +dir="$(cd "$(dirname "$0")" && pwd)" + +unzip $dir/fastqc_v0.11.2.zip -d $PWD/ > $PWD/unziplog.log +chmod 755 $PWD/FastQC/fastqc + +declare -A trim_start +declare -A trim_end +for ((i=8;i<=$#;i=i+4)) +do + j=$((i+1)) + start_int=$((i+2)) + end_int=$((i+3)) + id="${!i}" + echo "$id, ${start_int}, ${end_int}" + trim_start[$id]=${!start_int} + trim_end[$id]=${!end_int} + echo -e "$id\t${!j}" >> $outDir/barcodes.txt + +done +trim_start["unmatched"]=0 +trim_end["unmatched"]=0 + +echo "trim_start = ${trim_start[@]}" +echo "trim_end = ${trim_end[@]}" + +workdir=$PWD +cd $outDir +echo "$3" +filetype=`file $input` +result="" +if [[ $filetype == *ASCII* ]] +then + result=`cat $input | $dir/fastx_barcode_splitter.pl --bcfile $outDir/barcodes.txt --prefix "$prefix" --suffix ".fastq" --$EOL --mismatches $mismatches --partial $partial` +else + result=`$dir/sff2fastq $input | $dir/fastx_barcode_splitter.pl --bcfile $outDir/barcodes.txt --prefix "$prefix" --suffix ".fastq" --$EOL --mismatches $mismatches --partial $partial` +fi + +echo "$result" | tail -n +2 | sed 's/\t/,/g' > output.txt +echo "<html><head><title>$name demultiplex</title></head><body><table border='1'><thead><tr><th>ID</th><th>Count</th><th>FASTQ</th><th>FASTA</th><th>Trimmed FASTA</th><th>FASTQC</th></tr></thead><tbody>" >> $output +while IFS=, read barcode count location + do + if [ "total" == "$barcode" ] + then + echo "<tr><td>$barcode</td><td>$count</td><td></td><td></td><td></td><td></td><td></td><td></td></tr>" >> $output + break + fi + file="${name}_${barcode}" + mkdir "$outDir/fastqc_$barcode" + $workdir/FastQC/fastqc "$file.fastq" -o "$outDir" 2> /dev/null + cat "$file.fastq" | awk 'NR%4==1{printf ">%s\n", substr($0,2)}NR%4==2{print}' > "$file.fasta" + python $dir/trim.py --input "$file.fasta" --output "${file}_trimmed.fasta" --start "${trim_start[$barcode]}" --end "${trim_end[$barcode]}" + echo "<tr><td>$barcode</td><td>$count</td><td><a href='$file.fastq'>$file.fastq</a></td><td><a href='$file.fasta'>$file.fasta</a></td><td><a href='${file}_trimmed.fasta'>${file}_trimmed.fasta</a></td><td><a href='${name}_${barcode}_fastqc.html'>Report</a></td></tr>" >> $output +done < output.txt +echo "</tbody></body></html>" >> $output