Next changeset 1:cbce7f35f8b0 (2016-12-21) |
Commit message:
Uploaded |
added:
demultiplex.xml fastqc_v0.11.2.zip fastx_barcode_splitter.pl sff2fastq trim.py wrapper.sh |
b |
diff -r 000000000000 -r cb08a27e5fc2 demultiplex.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/demultiplex.xml Mon Aug 29 05:44:57 2016 -0400 |
b |
b'@@ -0,0 +1,349 @@\n+<tool id="demultiplex" name="Demultiplex" version="1.0">\n+\t<description> </description>\n+\t<requirements>\n+\t\t<requirement type="package" version="0.0.13">fastx_toolkit</requirement>\n+\t</requirements>\n+\t<command interpreter="bash">\n+\t\twrapper.sh "$input" "$out_file" "$out_file.files_path" "$where" "$mismatches" "$partial" "$input.name"\n+\t\t#for $i, $b in enumerate($barcodes)\n+ "$b.id"\n+ "$b.mid"\n+ "$b.trim_start"\n+ "$b.trim_end"\n+ #end for\n+\t</command>\n+\t<inputs>\n+\t\t<param name="input" type="data" label="File to split" />\n+\t\t<repeat name="barcodes" title="Barcodes" min="1" default="1">\n+ <param name="id" type="text" size="50" label="ID" />\n+\t\t\t<param name="mid" type="select" label="Mid">\n+\t\t\t\t<option value="ACGAGTGCGT">MID-1</option>\n+\t\t\t\t<option value="ACGCACTCGT">MID-1 reverse complement</option>\n+\t\t\t\t<option value="ACGCTCGACA">MID-2</option>\n+\t\t\t\t<option value="TGTCGAGCGT">MID-2 reverse complement</option>\n+\t\t\t\t<option value="AGACGCACTC">MID-3</option>\n+\t\t\t\t<option value="GAGTGCGTCT">MID-3 reverse complement</option>\n+\t\t\t\t<option value="AGCACTGTAG">MID-4</option>\n+\t\t\t\t<option value="CTACAGTGCT">MID-4 reverse complement</option>\n+\t\t\t\t<option value="ATCAGACACG">MID-5</option>\n+\t\t\t\t<option value="CGTGTCTGAT">MID-5 reverse complement</option>\n+\t\t\t\t<option value="ATATCGCGAG">MID-6</option>\n+\t\t\t\t<option value="CTCGCGATAT">MID-6 reverse complement</option>\n+\t\t\t\t<option value="CGTGTCTCTA">MID-7</option>\n+\t\t\t\t<option value="TAGAGACACG">MID-7 reverse complement</option>\n+\t\t\t\t<option value="CTCGCGTGTC">MID-8</option>\n+\t\t\t\t<option value="GACACGCGAG">MID-8 reverse complement</option>\n+\t\t\t\t<option value="TCTCTATGCG">MID-10</option>\n+\t\t\t\t<option value="CGCATAGAGA">MID-10 reverse complement</option>\n+\t\t\t\t<option value="TGATACGTCT">MID-11</option>\n+\t\t\t\t<option value="AGACGTATCA">MID-11 reverse 
complement</option>\n+\t\t\t\t<option value="CATAGTAGTG">MID-13</option>\n+\t\t\t\t<option value="CACTACTATG">MID-13 reverse complement</option>\n+\t\t\t\t<option value="CGAGAGATAC">MID-14</option>\n+\t\t\t\t<option value="GTATCTCTCG">MID-14 reverse complement</option>\n+\t\t\t\t<option value="ATACGACGTA">MID-15</option>\n+\t\t\t\t<option value="TACGTCGTAT">MID-15 reverse complement</option>\n+\t\t\t\t<option value="TCACGTACTA">MID-16</option>\n+\t\t\t\t<option value="TAGTACGTGA">MID-16 reverse complement</option>\n+\t\t\t\t<option value="CGTCTAGTAC">MID-17</option>\n+\t\t\t\t<option value="GTACTAGACG">MID-17 reverse complement</option>\n+\t\t\t\t<option value="TCTACGTAGC">MID-18</option>\n+\t\t\t\t<option value="GCTACGTAGA">MID-18 reverse complement</option>\n+\t\t\t\t<option value="TGTACTACTC">MID-19</option>\n+\t\t\t\t<option value="GAGTAGTACA">MID-19 reverse complement</option>\n+\t\t\t\t<option value="ACGACTACAG">MID-20</option>\n+\t\t\t\t<option value="CTGTAGTCGT">MID-20 reverse complement</option>\n+\t\t\t\t<option value="CGTAGACTAG">MID-21</option>\n+\t\t\t\t<option value="CTAGTCTACG">MID-21 reverse complement</option>\n+\t\t\t\t<option value="TACGAGTATG">MID-22</option>\n+\t\t\t\t<option value="CATACTCGTA">MID-22 reverse complement</option>\n+\t\t\t\t<option value="TACTCTCGTG">MID-23</option>\n+\t\t\t\t<option value="CACGAGAGTA">MID-23 reverse complement</option>\n+\t\t\t\t<option value="TAGAGACGAG">MID-24</option>\n+\t\t\t\t<option value="CTCGTCTCTA">MID-24 reverse complement</option>\n+\t\t\t\t<option value="TCGTCGCTCG">MID-25</option>\n+\t\t\t\t<option value="CGAGCGACGA">MID-25 reverse complement</option>\n+\t\t\t\t<option value="ACATACGCGT">MID-26</option>\n+\t\t\t\t<option value="ACGCGTATGT">MID-26 reverse complement</option>\n+\t\t\t\t<option value="ACGCGAGTAT">MID-27</option>\n+\t\t\t\t<option value="ATACTCGCGT">MID-27 reverse complement</option>\n+\t\t\t\t<option value="ACTACTATGT">MID-28</option>\n+\t\t\t\t<option value="ACATAGTAGT">MID-28 reverse 
complement</option>\n+\t\t\t\t<option value="ACTGTACAGT">MID-29</option>\n+\t\t\t\t<option value="ACTGTACAGT">MID-29 reverse complement</option>\n+\t\t\t\t<option value="AGACTATACT">MID-30</option>\n+\t\t\t\t<option value="AGTATAGTCT">MID-30 reverse complement</option>\n+\t\t\t\t<option value="AGCGTCGTCT'..b'="CAGTACTGCG">MID-130</option>\n+\t\t\t\t<option value="CGCAGTACTG">MID-130 reverse complement</option>\n+\t\t\t\t<option value="CGACAGCGAG">MID-131</option>\n+\t\t\t\t<option value="CTCGCTGTCG">MID-131 reverse complement</option>\n+\t\t\t\t<option value="CGATCTGTCG">MID-132</option>\n+\t\t\t\t<option value="CGACAGATCG">MID-132 reverse complement</option>\n+\t\t\t\t<option value="CGCGTGCTAG">MID-133</option>\n+\t\t\t\t<option value="CTAGCACGCG">MID-133 reverse complement</option>\n+\t\t\t\t<option value="CGCTCGAGTG">MID-134</option>\n+\t\t\t\t<option value="CACTCGAGCG">MID-134 reverse complement</option>\n+\t\t\t\t<option value="CGTGATGACG">MID-135</option>\n+\t\t\t\t<option value="CGTCATCACG">MID-135 reverse complement</option>\n+\t\t\t\t<option value="CTATGTACAG">MID-136</option>\n+\t\t\t\t<option value="CTGTACATAG">MID-136 reverse complement</option>\n+\t\t\t\t<option value="CTCGATATAG">MID-137</option>\n+\t\t\t\t<option value="CTATATCGAG">MID-137 reverse complement</option>\n+\t\t\t\t<option value="CTCGCACGCG">MID-138</option>\n+\t\t\t\t<option value="CGCGTGCGAG">MID-138 reverse complement</option>\n+\t\t\t\t<option value="CTGCGTCACG">MID-139</option>\n+\t\t\t\t<option value="CGTGACGCAG">MID-139 reverse complement</option>\n+\t\t\t\t<option value="CTGTGCGTCG">MID-140</option>\n+\t\t\t\t<option value="CGACGCACAG">MID-140 reverse complement</option>\n+\t\t\t\t<option value="TAGCATACTG">MID-141</option>\n+\t\t\t\t<option value="CAGTATGCTA">MID-141 reverse complement</option>\n+\t\t\t\t<option value="TATACATGTG">MID-142</option>\n+\t\t\t\t<option value="CACATGTATA">MID-142 reverse complement</option>\n+\t\t\t\t<option 
value="TATCACTCAG">MID-143</option>\n+\t\t\t\t<option value="CTGAGTGATA">MID-143 reverse complement</option>\n+\t\t\t\t<option value="TATCTGATAG">MID-144</option>\n+\t\t\t\t<option value="CTATCAGATA">MID-144 reverse complement</option>\n+\t\t\t\t<option value="TCGTGACATG">MID-145</option>\n+\t\t\t\t<option value="CATGTCACGA">MID-145 reverse complement</option>\n+\t\t\t\t<option value="TCTGATCGAG">MID-146</option>\n+\t\t\t\t<option value="CTCGATCAGA">MID-146 reverse complement</option>\n+\t\t\t\t<option value="TGACATCTCG">MID-147</option>\n+\t\t\t\t<option value="CGAGATGTCA">MID-147 reverse complement</option>\n+\t\t\t\t<option value="TGAGCTAGAG">MID-148</option>\n+\t\t\t\t<option value="CTCTAGCTCA">MID-148 reverse complement</option>\n+\t\t\t\t<option value="TGATAGAGCG">MID-149</option>\n+\t\t\t\t<option value="CGCTCTATCA">MID-149 reverse complement</option>\n+\t\t\t\t<option value="TGCGTGTGCG">MID-150</option>\n+\t\t\t\t<option value="CGCACACGCA">MID-150 reverse complement</option>\n+\t\t\t\t<option value="TGCTAGTCAG">MID-151</option>\n+\t\t\t\t<option value="CTGACTAGCA">MID-151 reverse complement</option>\n+\t\t\t\t<option value="TGTATCACAG">MID-152</option>\n+\t\t\t\t<option value="CTGTGATACA">MID-152 reverse complement</option>\n+\t\t\t\t<option value="TGTGCGCGTG">MID-153</option>\n+\t\t\t\t<option value="CACGCGCACA">MID-153 reverse complement</option>\n+\t\t\t</param>\n+\t\t\t\n+\t\t\t<param name="trim_start" type="integer" size="3" value="0" label="How many nucleotides to trim from the start" />\n+\t\t\n+\t\t\t<param name="trim_end" type="integer" size="3" value="0" label="How many nucleotides to trim from the end" />\n+\t\t</repeat>\n+\t\t\n+\t\t<param name="where" type="select" label="Barcodes found at">\n+\t\t\t<option value="bol">Start: 5\' end</option>\n+\t\t\t<option value="eol">End: 3\' end</option>\n+\t\t</param>\n+\t\t\n+\t\t<param name="mismatches" type="integer" size="3" value="2" label="Max. number of mismatches allowed." 
/>\n+\t\t\n+\t\t<param name="partial" type="integer" size="3" value="0" label="Allow partial overlap of barcodes." />\n+\t\t\n+\t</inputs>\n+\t<outputs>\n+\t\t<data format="html" name="out_file" />\n+\t</outputs>\n+\t<help>\n+- Splitting sff or fastq files into FASTQ, FASTA and (optional) trimmed FASTA files with a FASTQC report on the FASTQ file, this tool uses:\n+- sff2fastq (https://github.com/indraniel/sff2fastq) to extract a fastq file.\n+- fastx_barcode_splitter.pl (http://hannonlab.cshl.edu/fastx_toolkit/commandline.html) to demultiplex.\n+- fastqc (http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) to provide analysis of the fastq files.\n+\t\t\n+\t</help>\n+</tool>\n' |
b |
diff -r 000000000000 -r cb08a27e5fc2 fastqc_v0.11.2.zip |
b |
Binary file fastqc_v0.11.2.zip has changed |
b |
diff -r 000000000000 -r cb08a27e5fc2 fastx_barcode_splitter.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastx_barcode_splitter.pl Mon Aug 29 05:44:57 2016 -0400 |
[ |
b'@@ -0,0 +1,472 @@\n+#!/usr/bin/perl\n+\n+# FASTX-toolkit - FASTA/FASTQ preprocessing tools.\n+# Copyright (C) 2009-2013 A. Gordon (assafgordon@gmail.com)\n+#\n+# This program is free software: you can redistribute it and/or modify\n+# it under the terms of the GNU Affero General Public License as\n+# published by the Free Software Foundation, either version 3 of the\n+# License, or (at your option) any later version.\n+#\n+# This program is distributed in the hope that it will be useful,\n+# but WITHOUT ANY WARRANTY; without even the implied warranty of\n+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n+# GNU Affero General Public License for more details.\n+#\n+# You should have received a copy of the GNU Affero General Public License\n+# along with this program. If not, see <http://www.gnu.org/licenses/>.\n+\n+use strict;\n+use warnings;\n+use IO::Handle;\n+use Data::Dumper;\n+use Getopt::Long;\n+use Carp;\n+\n+##\n+## This program splits a FASTQ/FASTA file into several smaller files,\n+## Based on barcode matching.\n+##\n+## run with "--help" for usage information\n+##\n+## Assaf Gordon <assafgordon@gmail.com> , 11sep2008\n+\n+# Forward declarations\n+sub load_barcode_file ($);\n+sub parse_command_line ;\n+sub match_sequences ;\n+sub mismatch_count($$) ;\n+sub print_results;\n+sub open_and_detect_input_format;\n+sub read_record;\n+sub write_record($);\n+sub usage();\n+\n+# Global flags and arguments, \n+# Set by command line argumens\n+my $barcode_file ;\n+my $barcodes_at_eol = 0 ;\n+my $barcodes_at_bol = 0 ;\n+my $exact_match = 0 ;\n+my $allow_partial_overlap = 0;\n+my $allowed_mismatches = 1;\n+my $newfile_suffix = \'\';\n+my $newfile_prefix ;\n+my $quiet = 0 ;\n+my $debug = 0 ;\n+my $fastq_format = 1;\n+\n+# Global variables \n+# Populated by \'create_output_files\'\n+my %filenames;\n+my %files;\n+my %counts = ( \'unmatched\' => 0 );\n+my $barcodes_length;\n+my @barcodes;\n+my $input_file_io;\n+\n+\n+# The Four lines per record in FASTQ 
format.\n+# (when using FASTA format, only the first two are used)\n+my $seq_name;\n+my $seq_bases;\n+my $seq_name2;\n+my $seq_qualities;\n+\n+\n+#\n+# Start of Program\n+#\n+parse_command_line ;\n+\n+load_barcode_file ( $barcode_file ) ;\n+\n+open_and_detect_input_format;\n+\n+match_sequences ;\n+\n+print_results unless $quiet;\n+\n+#\n+# End of program\n+#\n+\n+\n+\n+\n+\n+\n+\n+\n+sub parse_command_line {\n+\tmy $help;\n+\n+\tusage() if (scalar @ARGV==0);\n+\n+\tmy $result = GetOptions ( "bcfile=s" => \\$barcode_file,\n+\t\t\t\t "eol" => \\$barcodes_at_eol,\n+\t\t\t\t "bol" => \\$barcodes_at_bol,\n+\t\t\t\t "exact" => \\$exact_match,\n+\t\t\t\t "prefix=s" => \\$newfile_prefix,\n+\t\t\t\t "suffix=s" => \\$newfile_suffix,\n+\t\t\t\t "quiet" => \\$quiet, \n+\t\t\t\t "partial=i" => \\$allow_partial_overlap,\n+\t\t\t\t "debug" => \\$debug,\n+\t\t\t\t "mismatches=i" => \\$allowed_mismatches,\n+\t\t\t\t "help" => \\$help\n+\t\t\t\t ) ;\n+\t\n+\tusage() if ($help);\n+\n+\tdie "Error: barcode file not specified (use \'--bcfile [FILENAME]\')\\n" unless defined $barcode_file;\n+\tdie "Error: prefix path/filename not specified (use \'--prefix [PATH]\')\\n" unless defined $newfile_prefix;\n+\n+\tif ($barcodes_at_bol == $barcodes_at_eol) {\n+\t\tdie "Error: can\'t specify both --eol & --bol\\n" if $barcodes_at_eol;\n+\t\tdie "Error: must specify either --eol or --bol\\n" ;\n+\t}\n+\n+\tdie "Error: invalid for value partial matches (valid values are 0 or greater)\\n" if $allow_partial_overlap<0;\n+\n+\t$allowed_mismatches = 0 if $exact_match;\n+\n+\tdie "Error: invalid value for mismatches (valid values are 0 or more)\\n" if ($allowed_mismatches<0);\n+\n+\tdie "Error: partial overlap value ($allow_partial_overlap) bigger than " . \n+\t\t"max. 
allowed mismatches ($allowed_mismatches)\\n" if ($allow_partial_overlap > $allowed_mismatches);\n+\n+\n+\texit unless $result;\n+}\n+\n+\n+\n+#\n+# Read the barcode file\n+#\n+sub load_barcode_file ($) {\n+\tmy $filename = shift or croak "Missing barcode file name";\n+\n+\topen BCFILE,"<$filename" or die "Error: failed to open barcode file ($filename)\\n";\n+\twhile (<BCFILE>) {\n+\t\tnext if m/^#/;\n+\t\tchomp;\n+\t\tmy ($ident, $barcode) = split ;\n+\n+'..b'arcodes file name. (see explanation below.)\n+--prefix PREFIX\t- File prefix. will be added to the output files. Can be used\n+\t\t to specify output directories.\n+--suffix SUFFIX\t- File suffix (optional). Can be used to specify file\n+\t\t extensions.\n+--bol\t\t- Try to match barcodes at the BEGINNING of sequences.\n+\t\t (What biologists would call the 5\' end, and programmers\n+\t\t would call index 0.)\n+--eol\t\t- Try to match barcodes at the END of sequences.\n+\t\t (What biologists would call the 3\' end, and programmers\n+\t\t would call the end of the string.)\n+\t\t NOTE: one of --bol, --eol must be specified, but not both.\n+--mismatches N\t- Max. number of mismatches allowed. default is 1.\n+--exact\t\t- Same as \'--mismatches 0\'. If both --exact and --mismatches \n+\t\t are specified, \'--exact\' takes precedence.\n+--partial N\t- Allow partial overlap of barcodes. (see explanation below.)\n+\t\t (Default is not partial matching)\n+--quiet\t\t- Don\'t print counts and summary at the end of the run.\n+\t\t (Default is to print.)\n+--debug\t\t- Print lots of useless debug information to STDERR.\n+--help\t\t- This helpful help screen.\n+\n+Example (Assuming \'s_2_100.txt\' is a FASTQ file, \'mybarcodes.txt\' is \n+the barcodes file):\n+\n+ \\$ cat s_2_100.txt | $0 --bcfile mybarcodes.txt --bol --mismatches 2 \\\\\n+ \t--prefix /tmp/bla_ --suffix ".txt"\n+\n+Barcode file format\n+-------------------\n+Barcode files are simple text files. 
Each line should contain an identifier \n+(descriptive name for the barcode), and the barcode itself (A/C/G/T), \n+separated by a TAB character. Example:\n+\n+ #This line is a comment (starts with a \'number\' sign)\n+ BC1 GATCT\n+ BC2 ATCGT\n+ BC3 GTGAT\n+ BC4 TGTCT\n+\n+For each barcode, a new FASTQ file will be created (with the barcode\'s \n+identifier as part of the file name). Sequences matching the barcode \n+will be stored in the appropriate file.\n+\n+Running the above example (assuming "mybarcodes.txt" contains the above \n+barcodes), will create the following files:\n+\t/tmp/bla_BC1.txt\n+\t/tmp/bla_BC2.txt\n+\t/tmp/bla_BC3.txt\n+\t/tmp/bla_BC4.txt\n+\t/tmp/bla_unmatched.txt\n+The \'unmatched\' file will contain all sequences that didn\'t match any barcode.\n+\n+Barcode matching\n+----------------\n+\n+** Without partial matching:\n+\n+Count mismatches between the FASTA/Q sequences and the barcodes.\n+The barcode which matched with the lowest mismatches count (providing the\n+count is small or equal to \'--mismatches N\') \'gets\' the sequences.\n+\n+Example (using the above barcodes):\n+Input Sequence:\n+ GATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG\n+\n+Matching with \'--bol --mismatches 1\':\n+ GATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG\n+ GATCT (1 mismatch, BC1)\n+ ATCGT (4 mismatches, BC2)\n+ GTGAT (3 mismatches, BC3)\n+ TGTCT (3 mismatches, BC4)\n+\n+This sequence will be classified as \'BC1\' (it has the lowest mismatch count).\n+If \'--exact\' or \'--mismatches 0\' were specified, this sequence would be \n+classified as \'unmatched\' (because, although BC1 had the lowest mismatch count,\n+it is above the maximum allowed mismatches).\n+\n+Matching with \'--eol\' (end of line) does the same, but from the other side\n+of the sequence.\n+\n+** With partial matching (very similar to indels):\n+\n+Same as above, with the following addition: barcodes are also checked for\n+partial overlap (number of allowed non-overlapping bases is \'--partial 
N\').\n+\n+Example:\n+Input sequence is ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG\n+(Same as above, but note the missing \'G\' at the beginning.)\n+\n+Matching (without partial overlapping) against BC1 yields 4 mismatches:\n+ ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG\n+ GATCT (4 mismatches)\n+\n+Partial overlapping would also try the following match:\n+ -ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG\n+ GATCT (1 mismatch)\n+\n+Note: scoring counts a missing base as a mismatch, so the final\n+mismatch count is 2 (1 \'real\' mismatch, 1 \'missing base\' mismatch).\n+If running with \'--mismatches 2\' (meaning allowing upto 2 mismatches) - this \n+seqeunce will be classified as BC1.\n+\n+EOF\n+\n+exit 1;\n+}\n' |
b |
diff -r 000000000000 -r cb08a27e5fc2 sff2fastq |
b |
Binary file sff2fastq has changed |
b |
diff -r 000000000000 -r cb08a27e5fc2 trim.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/trim.py Mon Aug 29 05:44:57 2016 -0400 |
[ |
"""Trim a fixed number of nucleotides from both ends of every FASTA record.

Command line:
    trim.py --input IN.fasta --output OUT.fasta [--start N] [--end M]

Behaviour:
  * When neither --start nor --end is positive, the input file is copied
    to the output unchanged.
  * Otherwise each record's sequence lines are joined, the first N and the
    last M characters are removed, and the record is written as a header
    line followed by a single sequence line.
  * Records whose sequence is empty after trimming are not written.
  * Sequence data that appears before the first header line is ignored.

The code avoids syntax that is specific to either Python 2 or Python 3.
"""
import argparse
import shutil
import sys


def trim_sequence(sequence, start, end):
    """Return `sequence` without its first `start` and last `end` characters.

    Negative values are treated as 0. If the two trims together cover the
    whole sequence, the empty string is returned.
    """
    start = max(start, 0)
    end = max(end, 0)
    # Clamp the stop index at 0 so that an `end` larger than the sequence
    # cannot turn into a negative (i.e. counted-from-the-right) slice bound.
    stop = max(len(sequence) - end, 0)
    return sequence[start:stop]


def read_fasta(handle):
    """Yield (header, sequence) pairs from an iterable of FASTA lines.

    `header` is the full header line, including the leading '>' but without
    the line terminator. `sequence` is the concatenation of the record's
    sequence lines with trailing whitespace stripped from each line.
    """
    header = None
    chunks = []
    for line in handle:
        if line.startswith(">"):
            if header is not None:
                yield header, "".join(chunks)
            header = line.rstrip("\r\n")
            chunks = []
        elif header is not None:
            chunks.append(line.rstrip())
    if header is not None:
        yield header, "".join(chunks)


def trim_fasta(input_path, output_path, start, end):
    """Write a trimmed copy of the FASTA file `input_path` to `output_path`.

    Returns the number of records written. Records that are empty after
    trimming are skipped, so an empty input produces an empty output.
    """
    written = 0
    with open(input_path, "r") as source:
        with open(output_path, "w") as target:
            for header, sequence in read_fasta(source):
                trimmed = trim_sequence(sequence, start, end)
                if not trimmed:
                    continue
                target.write(header + "\n")
                target.write(trimmed + "\n")
                written += 1
    return written


def main(argv=None):
    """Parse the command line (`argv`, default sys.argv[1:]) and run the trim.

    Returns the process exit status (0 on success).
    """
    # docs.python.org/dev/library/argparse.html
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True, help="Input fasta")
    parser.add_argument("--output", required=True, help="Output fasta")
    parser.add_argument("--start", type=int, default=0,
                        help="How many nucleotides to trim from the start")
    parser.add_argument("--end", type=int, default=0,
                        help="How many nucleotides to trim from the end")
    args = parser.parse_args(argv)

    # Echo the effective parameters on stdout, one per line.
    print(args.input)
    print(args.output)
    print(args.start)
    print(args.end)

    if args.start <= 0 and args.end <= 0:
        # Nothing to trim: keep the file exactly as it is.
        shutil.copy(args.input, args.output)
        return 0

    trim_fasta(args.input, args.output, args.start, args.end)
    return 0


if __name__ == "__main__":
    sys.exit(main())
b |
diff -r 000000000000 -r cb08a27e5fc2 wrapper.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/wrapper.sh Mon Aug 29 05:44:57 2016 -0400 |
[ |
#!/bin/bash
#
# wrapper.sh - Galaxy wrapper that demultiplexes an SFF or FASTQ dataset.
#
# Usage:
#   wrapper.sh INPUT OUTPUT_HTML OUTPUT_DIR bol|eol MISMATCHES PARTIAL NAME \
#              [ID BARCODE TRIM_START TRIM_END]...
#
# For every barcode the script produces, inside OUTPUT_DIR:
#   <name>_<ID>.fastq          reads assigned by fastx_barcode_splitter.pl
#   <name>_<ID>.fasta          the same reads converted to FASTA
#   <name>_<ID>_trimmed.fasta  FASTA trimmed by trim.py
#   <name>_<ID>_fastqc.html    FastQC report
# and it writes an HTML summary table linking to those files to OUTPUT_HTML.
#
# The helper programs (fastqc_v0.11.2.zip, fastx_barcode_splitter.pl,
# sff2fastq, trim.py) are expected next to this script.
#
# Requires bash >= 4 (associative arrays).

# Print a diagnostic message on stderr.
err() {
  printf 'wrapper.sh: %s\n' "$*" >&2
}

# Print the command-line synopsis on stderr.
usage() {
  printf '%s\n' \
    "Usage: wrapper.sh INPUT OUTPUT_HTML OUTPUT_DIR bol|eol MISMATCHES PARTIAL NAME [ID BARCODE TRIM_START TRIM_END]..." >&2
}

#######################################
# Escape a string for use in HTML text or attribute values.
# Arguments: $1 - raw string
# Outputs:   escaped string on stdout
#######################################
html_escape() {
  # '&' must be handled first so already-produced entities are not re-escaped.
  printf '%s\n' "$1" | sed \
    -e 's/&/\&amp;/g' \
    -e 's/</\&lt;/g' \
    -e 's/>/\&gt;/g' \
    -e 's/"/\&quot;/g' \
    -e "s/'/\&#39;/g"
}

#######################################
# Derive the output file stem from a dataset name: directory part and last
# extension removed, spaces replaced by underscores.
# Arguments: $1 - dataset name or path
# Outputs:   stem on stdout
#######################################
dataset_stem() {
  local stem
  stem=$(basename -- "$1")
  stem="${stem%.*}"
  printf '%s\n' "${stem// /_}"
}

#######################################
# Entry point; see the usage text at the top of the file.
# Arguments: the script's positional parameters
# Outputs:   progress information on stdout, diagnostics on stderr
# Returns:   0 on success, 2 on invalid arguments, 1 on runtime failure
#######################################
main() {
  if (( $# < 7 )); then
    usage
    return 2
  fi

  local input=$1 output=$2 out_dir=$3 where=$4 mismatches=$5 partial=$6
  local name prefix script_dir work_dir
  name=$(dataset_stem "$7")
  prefix="${name}_"
  shift 7

  case "$where" in
    bol | eol) ;;
    *)
      err "barcode position must be 'bol' or 'eol', got '$where'"
      return 2
      ;;
  esac

  work_dir=$PWD
  script_dir=$(cd "$(dirname "$0")" && pwd) || {
    err "cannot determine the script directory"
    return 1
  }

  # All of these are used after changing into the output directory, so they
  # must not stay relative to the starting directory.
  [[ $input == /* ]] || input="$work_dir/$input"
  [[ $output == /* ]] || output="$work_dir/$output"
  mkdir -p -- "$out_dir" || {
    err "cannot create output directory '$out_dir'"
    return 1
  }
  out_dir=$(cd "$out_dir" && pwd) || {
    err "cannot enter output directory '$out_dir'"
    return 1
  }

  # FastQC is shipped as a zip next to the script; -o avoids an interactive
  # overwrite prompt. A failure here only costs the FastQC reports, so it is
  # reported but does not abort the demultiplexing.
  if unzip -o "$script_dir/fastqc_v0.11.2.zip" -d "$work_dir/" \
    > "$work_dir/unziplog.log"; then
    chmod 755 "$work_dir/FastQC/fastqc"
  else
    err "could not unpack FastQC; reports will be missing"
  fi

  # Remaining arguments come in groups of four: ID BARCODE TRIM_START TRIM_END.
  local -A trim_start
  local -A trim_end
  local id
  : > "$out_dir/barcodes.txt"
  while (( $# >= 4 )); do
    id=$1
    # The barcode file is whitespace-delimited, so such IDs cannot be stored.
    if [[ -z $id || $id =~ [[:space:]] ]]; then
      err "barcode ID '$id' must be non-empty and must not contain whitespace"
      return 2
    fi
    echo "$id, $3, $4"
    trim_start["$id"]=$3
    trim_end["$id"]=$4
    printf '%s\t%s\n' "$id" "$2" >> "$out_dir/barcodes.txt"
    shift 4
  done
  if (( $# > 0 )); then
    err "ignoring $# trailing argument(s): barcode definitions need 4 values each"
  fi
  # Reads that match no barcode are never trimmed.
  trim_start["unmatched"]=0
  trim_end["unmatched"]=0

  echo "trim_start = ${trim_start[*]}"
  echo "trim_end = ${trim_end[*]}"

  cd "$out_dir" || {
    err "cannot enter output directory '$out_dir'"
    return 1
  }
  echo "$out_dir"

  local -a splitter=(
    "$script_dir/fastx_barcode_splitter.pl"
    --bcfile "$out_dir/barcodes.txt"
    --prefix "$prefix"
    --suffix ".fastq"
    "--$where"
    --mismatches "$mismatches"
    --partial "$partial"
  )

  # Text input is taken to be FASTQ already; anything else is converted from
  # SFF first. 'file -b' omits the path, so the path cannot cause a match.
  local result
  if [[ $(file -b "$input") == *ASCII* ]]; then
    result=$("${splitter[@]}" < "$input")
  else
    result=$("$script_dir/sff2fastq" "$input" | "${splitter[@]}")
  fi || {
    err "demultiplexing failed"
    return 1
  }

  # Comma-separated copy of the splitter summary (without its header line),
  # kept in the output directory for reference.
  printf '%s\n' "$result" | tail -n +2 | sed 's/\t/,/g' > output.txt

  printf '%s\n' "<html><head><title>$(html_escape "$name") demultiplex</title></head><body><table border='1'><thead><tr><th>ID</th><th>Count</th><th>FASTQ</th><th>FASTA</th><th>Trimmed FASTA</th><th>FASTQC</th></tr></thead><tbody>" > "$output"

  local barcode count file label link
  # The summary is read on fd 3 so the tools started inside the loop cannot
  # consume it through stdin.
  while IFS=$'\t' read -r -u 3 barcode count _; do
    [[ -n $barcode ]] || continue
    label=$(html_escape "$barcode")
    count=$(html_escape "$count")

    if [[ $barcode == "total" ]]; then
      # Summary row: the four link columns stay empty.
      printf '%s\n' "<tr><td>$label</td><td>$count</td><td></td><td></td><td></td><td></td></tr>" >> "$output"
      break
    fi

    file="${name}_${barcode}"
    link=$(html_escape "$file")
    mkdir -p -- "$out_dir/fastqc_$barcode"
    # FastQC's stderr is discarded on purpose, as in the original wrapper.
    "$work_dir/FastQC/fastqc" "$file.fastq" -o "$out_dir" 2> /dev/null
    # FASTQ -> FASTA: keep the header (with '@' replaced by '>') and the
    # sequence line of every four-line record.
    awk 'NR%4==1{printf ">%s\n", substr($0,2)}NR%4==2{print}' \
      "$file.fastq" > "$file.fasta"
    python "$script_dir/trim.py" \
      --input "$file.fasta" \
      --output "${file}_trimmed.fasta" \
      --start "${trim_start[$barcode]:-0}" \
      --end "${trim_end[$barcode]:-0}"
    printf '%s\n' "<tr><td>$label</td><td>$count</td><td><a href='$link.fastq'>$link.fastq</a></td><td><a href='$link.fasta'>$link.fasta</a></td><td><a href='${link}_trimmed.fasta'>${link}_trimmed.fasta</a></td><td><a href='${link}_fastqc.html'>Report</a></td></tr>" >> "$output"
  done 3< <(printf '%s\n' "$result" | tail -n +2)

  printf '%s\n' "</tbody></table></body></html>" >> "$output"
  return 0
}

main "$@"