# HG changeset patch
# User davidvanzessen
# Date 1472463897 14400
# Node ID cb08a27e5fc28a88594b77744a6c7fed77259159
Uploaded
diff -r 000000000000 -r cb08a27e5fc2 demultiplex.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/demultiplex.xml Mon Aug 29 05:44:57 2016 -0400
@@ -0,0 +1,349 @@
+
+
+
+ fastx_toolkit
+
+
+ wrapper.sh "$input" "$out_file" "$out_file.files_path" "$where" "$mismatches" "$partial" "$input.name"
+ #for $i, $b in enumerate($barcodes)
+ "$b.id"
+ "$b.mid"
+ "$b.trim_start"
+ "$b.trim_end"
+ #end for
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+- Splitting sff or fastq files into FASTQ, FASTA and (optional) trimmed FASTA files with a FASTQC report on the FASTQ file, this tool uses:
+- sff2fastq (https://github.com/indraniel/sff2fastq) to extract a fastq file.
+- fastx_barcode_splitter.pl (http://hannonlab.cshl.edu/fastx_toolkit/commandline.html) to demultiplex.
+- fastqc (http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) to provide analysis of the fastq files.
+
+
+
diff -r 000000000000 -r cb08a27e5fc2 fastqc_v0.11.2.zip
Binary file fastqc_v0.11.2.zip has changed
diff -r 000000000000 -r cb08a27e5fc2 fastx_barcode_splitter.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fastx_barcode_splitter.pl Mon Aug 29 05:44:57 2016 -0400
@@ -0,0 +1,472 @@
+#!/usr/bin/perl
+
+# FASTX-toolkit - FASTA/FASTQ preprocessing tools.
+# Copyright (C) 2009-2013 A. Gordon (assafgordon@gmail.com)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see .
+
+use strict;
+use warnings;
+use IO::Handle;
+use Data::Dumper;
+use Getopt::Long;
+use Carp;
+
+##
+## This program splits a FASTQ/FASTA file into several smaller files,
+## Based on barcode matching.
+##
+## run with "--help" for usage information
+##
+## Assaf Gordon , 11sep2008
+
+# Forward declarations
+sub load_barcode_file ($);
+sub parse_command_line ;
+sub match_sequences ;
+sub mismatch_count($$) ;
+sub print_results;
+sub open_and_detect_input_format;
+sub read_record;
+sub write_record($);
+sub usage();
+
+# Global flags and arguments,
+# Set by command line argumens
+my $barcode_file ;
+my $barcodes_at_eol = 0 ;
+my $barcodes_at_bol = 0 ;
+my $exact_match = 0 ;
+my $allow_partial_overlap = 0;
+my $allowed_mismatches = 1;
+my $newfile_suffix = '';
+my $newfile_prefix ;
+my $quiet = 0 ;
+my $debug = 0 ;
+my $fastq_format = 1;
+
+# Global variables
+# Populated by 'create_output_files'
+my %filenames;
+my %files;
+my %counts = ( 'unmatched' => 0 );
+my $barcodes_length;
+my @barcodes;
+my $input_file_io;
+
+
+# The Four lines per record in FASTQ format.
+# (when using FASTA format, only the first two are used)
+my $seq_name;
+my $seq_bases;
+my $seq_name2;
+my $seq_qualities;
+
+
+#
+# Start of Program
+#
+parse_command_line ;
+
+load_barcode_file ( $barcode_file ) ;
+
+open_and_detect_input_format;
+
+match_sequences ;
+
+print_results unless $quiet;
+
+#
+# End of program
+#
+
+
+
+
+
+
+
+
+sub parse_command_line {
+ my $help;
+
+ usage() if (scalar @ARGV==0);
+
+ my $result = GetOptions ( "bcfile=s" => \$barcode_file,
+ "eol" => \$barcodes_at_eol,
+ "bol" => \$barcodes_at_bol,
+ "exact" => \$exact_match,
+ "prefix=s" => \$newfile_prefix,
+ "suffix=s" => \$newfile_suffix,
+ "quiet" => \$quiet,
+ "partial=i" => \$allow_partial_overlap,
+ "debug" => \$debug,
+ "mismatches=i" => \$allowed_mismatches,
+ "help" => \$help
+ ) ;
+
+ usage() if ($help);
+
+ die "Error: barcode file not specified (use '--bcfile [FILENAME]')\n" unless defined $barcode_file;
+ die "Error: prefix path/filename not specified (use '--prefix [PATH]')\n" unless defined $newfile_prefix;
+
+ if ($barcodes_at_bol == $barcodes_at_eol) {
+ die "Error: can't specify both --eol & --bol\n" if $barcodes_at_eol;
+ die "Error: must specify either --eol or --bol\n" ;
+ }
+
+ die "Error: invalid for value partial matches (valid values are 0 or greater)\n" if $allow_partial_overlap<0;
+
+ $allowed_mismatches = 0 if $exact_match;
+
+ die "Error: invalid value for mismatches (valid values are 0 or more)\n" if ($allowed_mismatches<0);
+
+ die "Error: partial overlap value ($allow_partial_overlap) bigger than " .
+ "max. allowed mismatches ($allowed_mismatches)\n" if ($allow_partial_overlap > $allowed_mismatches);
+
+
+ exit unless $result;
+}
+
+
+
+#
+# Read the barcode file
+#
+sub load_barcode_file ($) {
+ my $filename = shift or croak "Missing barcode file name";
+
+ open BCFILE,"<$filename" or die "Error: failed to open barcode file ($filename)\n";
+ while () {
+ next if m/^#/;
+ chomp;
+ my ($ident, $barcode) = split ;
+
+ $barcode = uc($barcode);
+
+ # Sanity checks on the barcodes
+ die "Error: bad data at barcode file ($filename) line $.\n" unless defined $barcode;
+ die "Error: bad barcode value ($barcode) at barcode file ($filename) line $.\n"
+ unless $barcode =~ m/^[AGCT]+$/;
+
+ die "Error: bad identifier value ($ident) at barcode file ($filename) line $. (must be alphanumeric)\n"
+ unless $ident =~ m/^\w+$/;
+
+ die "Error: badcode($ident, $barcode) is shorter or equal to maximum number of " .
+ "mismatches ($allowed_mismatches). This makes no sense. Specify fewer mismatches.\n"
+ if length($barcode)<=$allowed_mismatches;
+
+ $barcodes_length = length($barcode) unless defined $barcodes_length;
+ die "Error: found barcodes in different lengths. this feature is not supported yet.\n"
+ unless $barcodes_length == length($barcode);
+
+ push @barcodes, [$ident, $barcode];
+
+ if ($allow_partial_overlap>0) {
+ foreach my $i (1 .. $allow_partial_overlap) {
+ substr $barcode, ($barcodes_at_bol)?0:-1, 1, '';
+ push @barcodes, [$ident, $barcode];
+ }
+ }
+ }
+ close BCFILE;
+
+ if ($debug) {
+ print STDERR "barcode\tsequence\n";
+ foreach my $barcoderef (@barcodes) {
+ my ($ident, $seq) = @{$barcoderef};
+ print STDERR $ident,"\t", $seq ,"\n";
+ }
+ }
+}
+
+# Create one output file for each barcode.
+# (Also create a file for the dummy 'unmatched' barcode)
+sub create_output_files {
+ my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers;
+ $barcodes{'unmatched'} = 1 ;
+
+ foreach my $ident (keys %barcodes) {
+ my $new_filename = $newfile_prefix . $ident . $newfile_suffix;
+ $filenames{$ident} = $new_filename;
+ open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n";
+ $files{$ident} = $file ;
+ }
+}
+
+sub match_sequences {
+
+ my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers;
+ $barcodes{'unmatched'} = 1 ;
+
+ #reset counters
+ foreach my $ident ( keys %barcodes ) {
+ $counts{$ident} = 0;
+ }
+
+ create_output_files;
+
+ # Read file FASTQ file
+ # split accotding to barcodes
+ while ( read_record ) {
+ chomp $seq_bases;
+
+ print STDERR "sequence $seq_bases: \n" if $debug;
+
+ my $best_barcode_mismatches_count = $barcodes_length;
+ my $best_barcode_ident = undef;
+
+ #Try all barcodes, find the one with the lowest mismatch count
+ foreach my $barcoderef (@barcodes) {
+ my ($ident, $barcode) = @{$barcoderef};
+
+ # Get DNA fragment (in the length of the barcodes)
+ # The barcode will be tested only against this fragment
+ # (no point in testing the barcode against the whole sequence)
+ my $sequence_fragment;
+ if ($barcodes_at_bol) {
+ $sequence_fragment = substr $seq_bases, 0, $barcodes_length;
+ } else {
+ $sequence_fragment = substr $seq_bases, - $barcodes_length;
+ }
+
+ my $mm = mismatch_count($sequence_fragment, $barcode) ;
+
+ # if this is a partial match, add the non-overlap as a mismatch
+ # (partial barcodes are shorter than the length of the original barcodes)
+ $mm += ($barcodes_length - length($barcode));
+
+ if ( $mm < $best_barcode_mismatches_count ) {
+ $best_barcode_mismatches_count = $mm ;
+ $best_barcode_ident = $ident ;
+ }
+ }
+
+ $best_barcode_ident = 'unmatched'
+ if ( (!defined $best_barcode_ident) || $best_barcode_mismatches_count>$allowed_mismatches) ;
+
+ print STDERR "sequence $seq_bases matched barcode: $best_barcode_ident\n" if $debug;
+
+ $counts{$best_barcode_ident}++;
+
+ #get the file associated with the matched barcode.
+ #(note: there's also a file associated with 'unmatched' barcode)
+ my $file = $files{$best_barcode_ident};
+
+ write_record($file);
+ }
+}
+
+#Quickly calculate hamming distance between two strings
+#
+#NOTE: Strings must be same length.
+# returns number of different characters.
+#see http://www.perlmonks.org/?node_id=500235
+sub mismatch_count($$) { length( $_[ 0 ] ) - ( ( $_[ 0 ] ^ $_[ 1 ] ) =~ tr[\0][\0] ) }
+
+
+
+sub print_results
+{
+ print "Barcode\tCount\tLocation\n";
+ my $total = 0 ;
+ foreach my $ident (sort keys %counts) {
+ print $ident, "\t", $counts{$ident},"\t",$filenames{$ident},"\n";
+ $total += $counts{$ident};
+ }
+ print "total\t",$total,"\n";
+}
+
+
+sub read_record
+{
+ $seq_name = $input_file_io->getline();
+
+ return undef unless defined $seq_name; # End of file?
+
+ $seq_bases = $input_file_io->getline();
+ die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases;
+
+ # If using FASTQ format, read two more lines
+ if ($fastq_format) {
+ $seq_name2 = $input_file_io->getline();
+ die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2;
+
+ $seq_qualities = $input_file_io->getline();
+ die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities;
+ }
+ return 1;
+}
+
+sub write_record($)
+{
+ my $file = shift;
+
+ croak "Bad file handle" unless defined $file;
+
+ print $file $seq_name;
+ print $file $seq_bases,"\n";
+
+ #if using FASTQ format, write two more lines
+ if ($fastq_format) {
+ print $file $seq_name2;
+ print $file $seq_qualities;
+ }
+}
+
+sub open_and_detect_input_format
+{
+ $input_file_io = new IO::Handle;
+ die "Failed to open STDIN " unless $input_file_io->fdopen(fileno(STDIN),"r");
+
+ # Get the first characeter, and push it back
+ my $first_char = $input_file_io->getc();
+ $input_file_io->ungetc(ord $first_char);
+
+ if ($first_char eq '>') {
+ # FASTA format
+ $fastq_format = 0 ;
+ print STDERR "Detected FASTA format\n" if $debug;
+ } elsif ($first_char eq '@') {
+ # FASTQ format
+ $fastq_format = 1;
+ print STDERR "Detected FASTQ format\n" if $debug;
+ } else {
+ die "Error: unknown file format. First character = '$first_char' (expecting > or \@)\n";
+ }
+}
+
+sub usage()
+{
+print<":
+ currentSeq = currentSeq[start:]
+ if currentSeq is not "" and currentId is not "":
+ o.write(currentId)
+ o.write(currentSeq + "\n")
+ currentId = line
+ currentSeq = ""
+ else:
+ currentSeq += line.rstrip()
+ o.write(currentId)
+ o.write(currentSeq[start:] + "\n")
+else:
+ with open(args.input, 'r') as i:
+ with open(args.output, 'w') as o:
+ for line in i.readlines():
+ if line[0] is ">":
+ currentSeq = currentSeq[start:-end]
+ if currentSeq is not "" and currentId is not "":
+ o.write(currentId)
+ o.write(currentSeq + "\n")
+ currentId = line
+ currentSeq = ""
+ else:
+ currentSeq += line.rstrip()
+ o.write(currentId)
+ o.write(currentSeq[start:-end] + "\n")
diff -r 000000000000 -r cb08a27e5fc2 wrapper.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/wrapper.sh Mon Aug 29 05:44:57 2016 -0400
@@ -0,0 +1,67 @@
+#!/bin/bash
+input="$1"
+output="$2"
+outDir="$3"
+mkdir "$outDir"
+EOL="$4"
+mismatches="$5"
+partial="$6"
+name=$(basename "$7")
+ext="${name##*.}"
+name="${name%.*}"
+name="${name// /_}"
+prefix="${name}_"
+dir="$(cd "$(dirname "$0")" && pwd)"
+
+unzip $dir/fastqc_v0.11.2.zip -d $PWD/ > $PWD/unziplog.log
+chmod 755 $PWD/FastQC/fastqc
+
+declare -A trim_start
+declare -A trim_end
+for ((i=8;i<=$#;i=i+4))
+do
+ j=$((i+1))
+ start_int=$((i+2))
+ end_int=$((i+3))
+ id="${!i}"
+ echo "$id, ${start_int}, ${end_int}"
+ trim_start[$id]=${!start_int}
+ trim_end[$id]=${!end_int}
+ echo -e "$id\t${!j}" >> $outDir/barcodes.txt
+
+done
+trim_start["unmatched"]=0
+trim_end["unmatched"]=0
+
+echo "trim_start = ${trim_start[@]}"
+echo "trim_end = ${trim_end[@]}"
+
+workdir=$PWD
+cd $outDir
+echo "$3"
+filetype=`file $input`
+result=""
+if [[ $filetype == *ASCII* ]]
+then
+ result=`cat $input | $dir/fastx_barcode_splitter.pl --bcfile $outDir/barcodes.txt --prefix "$prefix" --suffix ".fastq" --$EOL --mismatches $mismatches --partial $partial`
+else
+ result=`$dir/sff2fastq $input | $dir/fastx_barcode_splitter.pl --bcfile $outDir/barcodes.txt --prefix "$prefix" --suffix ".fastq" --$EOL --mismatches $mismatches --partial $partial`
+fi
+
+echo "$result" | tail -n +2 | sed 's/\t/,/g' > output.txt
+echo "$name demultiplexID | Count | FASTQ | FASTA | Trimmed FASTA | FASTQC |
" >> $output
+while IFS=, read barcode count location
+ do
+ if [ "total" == "$barcode" ]
+ then
+ echo "$barcode | $count | | | | | | |
" >> $output
+ break
+ fi
+ file="${name}_${barcode}"
+ mkdir "$outDir/fastqc_$barcode"
+ $workdir/FastQC/fastqc "$file.fastq" -o "$outDir" 2> /dev/null
+ cat "$file.fastq" | awk 'NR%4==1{printf ">%s\n", substr($0,2)}NR%4==2{print}' > "$file.fasta"
+ python $dir/trim.py --input "$file.fasta" --output "${file}_trimmed.fasta" --start "${trim_start[$barcode]}" --end "${trim_end[$barcode]}"
+ echo "$barcode | $count | $file.fastq | $file.fasta | ${file}_trimmed.fasta | Report |
" >> $output
+done < output.txt
+echo "" >> $output