Mercurial > repos > bgruening > trim_galore
view trim_galore @ 6:11962ce40855 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/trim_galore commit 9198b904ef37fe46007256f1734c33de6d23331b-dirty
author | bgruening |
---|---|
date | Wed, 07 Oct 2015 08:39:59 -0400 |
parents | 2c1f0fe810f7 |
children |
line wrap: on
line source
#!/usr/bin/perl use strict; use warnings; use Getopt::Long; use IPC::Open3; use File::Spec; use File::Basename; use Cwd; ## This program is Copyright (C) 2012-14, Felix Krueger (felix.krueger@babraham.ac.uk) ## This program is free software: you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation, either version 3 of the License, or ## (at your option) any later version. ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## You should have received a copy of the GNU General Public License ## along with this program. If not, see <http://www.gnu.org/licenses/>. ## this script is taking in FastQ sequences and trims them using Cutadapt ## last modified on 01 May 2015 my $DOWARN = 1; # print on screen warning and text by default BEGIN { $SIG{'__WARN__'} = sub { warn $_[0] if $DOWARN } }; my $trimmer_version = '0.4.0'; my ($cutoff,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$a2,$error_rate,$output_dir,$no_report_file,$dont_gzip,$clip_r1,$clip_r2,$three_prime_clip_r1,$three_prime_clip_r2,$nextera,$small_rna,$path_to_cutadapt,$illumina) = process_commandline(); my @filenames = @ARGV; die "\nPlease provide the filename(s) of one or more FastQ file(s) to launch Trim Galore!\n USAGE: 'trim_galore [options] <filename(s)>' or 'trim_galore --help' for more options\n\n" unless (@filenames); file_sanity_check($filenames[0]); ######################################################################## my $path_to_fastqc = 'fastqc'; # Before we start let's have quick look if Cutadapt seems to be working with the path information provided # To change the path to Cutadapt use --path_to_cutadapt /full/path/to/the/Cutadapt/executable if(defined $path_to_cutadapt){ warn "Path to Cutadapt set as: '$path_to_cutadapt' (user defined)\n"; # we'll simply use this } else{ $path_to_cutadapt = 'cutadapt'; # default, assuming it is in the PATH warn "Path to Cutadapt set as: '$path_to_cutadapt' (default)\n"; } my $cutadapt_version; my $return = system "$path_to_cutadapt --version"; #>/dev/null 2>&1"; if ($return == -1){ die "Failed to execute Cutadapt porperly. Please install Cutadapt first and make sure it is in the PATH, or specify the path to the Cutadapt executable using --path_to_cutadapt /path/to/cutadapt\n\n"; } else{ warn "Cutadapt seems to be working fine (tested command '$path_to_cutadapt --version')\n"; $cutadapt_version = `$path_to_cutadapt --version`; chomp $cutadapt_version; # warn "Cutadapt version: $cutadapt_version\n"; } ######################################################################## sub autodetect_adapter_type{ warn "\n\nAUTO-DETECTING ADAPTER TYPE\n===========================\n"; warn "Attempting to auto-detect adapter type from the first 1 million sequences of the first file (>> $ARGV[0] <<)\n\n"; if ($ARGV[0] =~ /gz$/){ open (AUTODETECT,"zcat $ARGV[0] |") or die "Failed to read from file $ARGV[0]\n"; } else{ open (AUTODETECT,$ARGV[0]) or die "Failed to read from file $ARGV[0]\n"; } my %adapters; $adapters{'Illumina'} -> {seq} = 'AGATCGGAAGAGC'; $adapters{'Illumina'} -> {count}= 0; $adapters{'Illumina'} -> {name}= 'Illumina TruSeq, Sanger iPCR; auto-detected'; $adapters{'Nextera'} -> {seq} = 'CTGTCTCTTATA'; $adapters{'Nextera'} -> {count}= 0; $adapters{'Nextera'} -> {name}= 'Nextera Transposase sequence; auto-detected'; $adapters{'smallRNA'} -> {seq} = 'ATGGAATTCTCG'; $adapters{'smallRNA'} -> {count}= 0; $adapters{'smallRNA'} -> {name}= 'Illumina small RNA adapter; auto-detected'; # we will read the first 1 million sequences, or until the end of the file whatever comes first, and then use the adapter that for trimming which was found to occcur most often my $count = 0; while (1){ my $line1 = <AUTODETECT>; my $line2 = <AUTODETECT>; my $line3 = <AUTODETECT>; my $line4 = <AUTODETECT>; last unless ($line4); $count++; last if ($count == 1000000); chomp $line2; $adapters{'Illumina'}->{count}++ unless (index($line2,'AGATCGGAAGAGC')== -1); $adapters{'Nextera'} ->{count}++ unless (index($line2,'CTGTCTCTTATA') == -1); $adapters{'smallRNA'}->{count}++ unless (index($line2,'ATGGAATTCTCG') == -1); } my $highest; my $second; my $seq; my $adapter_name; warn "Found perfect matches for the following adapter sequences:\nAdapter type\tCount\tSequence\tSequences analysed\tPercentage\n"; foreach my $adapter (sort {$adapters{$b}->{count}<=>$adapters{$a}->{count}} keys %adapters){ my $percentage = sprintf("%.2f",$adapters{$adapter}->{count}/$count*100); warn "$adapter\t$adapters{$adapter}->{count}\t$adapters{$adapter}->{seq}\t$count\t$percentage\n"; unless (defined $highest){ $highest = $adapter; $seq = $adapters{$adapter}->{seq}; $adapter_name = $adapters{$adapter}->{name}; next; } unless (defined $second){ $second = $adapter; } } # using the highest occurrence as adapter to look out for if ($adapters{$highest}->{count} == $adapters{$second}->{count}){ warn "Unable to auto-detect most prominent adapter from the first specified file (count $highest: $adapters{$highest}->{count}, count $second: $adapters{$second}->{second})\n"; if ($adapters{$highest}->{count} == 0){ warn "Defaulting to Illumina universal adapter ( AGATCGGAAGAGC ). Specify -a SEQUENCE to avoid this behavior).\n\n"; $adapter_name = 'Illumina TruSeq, Sanger iPCR; default (inconclusive auto-detection)'; $seq = 'AGATCGGAAGAGC'; } else{ warn "Using $highest adapter for trimming (count: $adapters{$highest}->{count}). Second best hit was $second (count: $adapters{$second}->{count})\n\n"; } } else{ warn "Using $highest adapter for trimming (count: $adapters{$highest}->{count}). Second best hit was $second (count: $adapters{$second}->{count})\n\n"; } close AUTODETECT; return ($seq,$adapter_name); } ### SETTING DEFAULTS UNLESS THEY WERE SPECIFIED unless (defined $cutoff){ $cutoff = 20; } my $phred_score_cutoff = $cutoff; # only relevant for report my $adapter_name = ''; unless (defined $adapter){ if ($nextera){ $adapter = 'CTGTCTCTTATA'; $adapter_name = 'Nextera Transposase sequence; user defined'; } elsif($small_rna){ $adapter = 'ATGGAATTCTCG'; $adapter_name = 'Illumina small RNA adapter; user defined'; } elsif($illumina){ $adapter = 'AGATCGGAAGAGC'; $adapter_name = 'Illumina TruSeq, Sanger iPCR; user defined'; } else{ # default ($adapter,$adapter_name) = autodetect_adapter_type(); } } unless (defined $a2){ # optional adapter for the second read in a pair. Only works for --paired trimming $a2 = ''; } unless (defined $stringency){ $stringency = 1; } if ($phred_encoding == 64){ $cutoff += 31; } my $file_1; my $file_2; foreach my $filename (@ARGV){ trim ($filename); } sub trim{ my $filename = shift; my $output_filename = (split (/\//,$filename))[-1]; my $report = $output_filename; $report =~ s/$/_trimming_report.txt/; if ($no_report_file) { $report = File::Spec->devnull; open (REPORT,'>',$report) or die "Failed to write to file '$report': $!\n"; # warn "Redirecting report output to /dev/null\n"; } else{ open (REPORT,'>',$output_dir.$report) or die "Failed to write to file '$report': $!\n"; warn "Writing report to '$output_dir$report'\n"; } warn "\nSUMMARISING RUN PARAMETERS\n==========================\nInput filename: $filename\n"; print REPORT "\nSUMMARISING RUN PARAMETERS\n==========================\nInput filename: $filename\n"; if ($validate){ # paired-end mode warn "Trimming mode: paired-end\n"; print REPORT "Trimming mode: paired-end\n"; } else{ warn "Trimming mode: single-end\n"; print REPORT "Trimming mode: single-end\n"; } warn "Trim Galore version: $trimmer_version\n"; print REPORT "Trim Galore version: $trimmer_version\n"; warn "Cutadapt version: $cutadapt_version\n"; print REPORT "Cutadapt version: $cutadapt_version\n"; warn "Quality Phred score cutoff: $phred_score_cutoff\n"; print REPORT "Quality Phred score cutoff: $phred_score_cutoff\n"; warn "Quality encoding type selected: ASCII+$phred_encoding\n"; print REPORT "Quality encoding type selected: ASCII+$phred_encoding\n"; warn "Adapter sequence: '$adapter' ($adapter_name)\n"; print REPORT "Adapter sequence: '$adapter' ($adapter_name)\n"; if ($error_rate == 0.1){ warn "Maximum trimming error rate: $error_rate (default)\n"; } else{ warn "Maximum trimming error rate: $error_rate\n"; } print REPORT "Maximum trimming error rate: $error_rate"; if ($error_rate == 0.1){ print REPORT " (default)\n"; } else{ print REPORT "\n"; } if ($a2){ warn "Optional adapter 2 sequence (only used for read 2 of paired-end files): '$a2'\n"; print REPORT "Optional adapter 2 sequence (only used for read 2 of paired-end files): '$a2'\n"; } warn "Minimum required adapter overlap (stringency): $stringency bp\n"; print REPORT "Minimum required adapter overlap (stringency): $stringency bp\n"; if ($validate){ warn "Minimum required sequence length for both reads before a sequence pair gets removed: $length_cutoff bp\n"; print REPORT "Minimum required sequence length for both reads before a sequence pair gets removed: $length_cutoff bp\n"; } else{ warn "Minimum required sequence length before a sequence gets removed: $length_cutoff bp\n"; print REPORT "Minimum required sequence length before a sequence gets removed: $length_cutoff bp\n"; } if ($validate){ # only for paired-end files if ($retain){ # keeping single-end reads if only one end is long enough if ($length_read_1 == 35){ warn "Length cut-off for read 1: $length_read_1 bp (default)\n"; print REPORT "Length cut-off for read 1: $length_read_1 bp (default)\n"; } else{ warn "Length cut-off for read 1: $length_read_1 bp\n"; print REPORT "Length cut-off for read 1: $length_read_1 bp\n"; } if ($length_read_2 == 35){ warn "Length cut-off for read 2: $length_read_2 bb (default)\n"; print REPORT "Length cut-off for read 2: $length_read_2 bp (default)\n"; } else{ warn "Length cut-off for read 2: $length_read_2 bp\n"; print REPORT "Length cut-off for read 2: $length_read_2 bp\n"; } } } if ($rrbs){ warn "File was specified to be an MspI-digested RRBS sample. Sequences with adapter contamination will be trimmed a further 2 bp to remove potential methylation-biased bases from the end-repair reaction\n"; print REPORT "File was specified to be an MspI-digested RRBS sample. Sequences with adapter contamination will be trimmed a further 2 bp to remove potential methylation-biased bases from the end-repair reaction\n"; } if ($non_directional){ warn "File was specified to be a non-directional MspI-digested RRBS sample. Sequences starting with either 'CAA' or 'CGA' will have the first 2 bp trimmed off to remove potential methylation-biased bases from the end-repair reaction\n"; print REPORT "File was specified to be a non-directional MspI-digested RRBS sample. Sequences starting with either 'CAA' or 'CGA' will have the first 2 bp trimmed off to remove potential methylation-biased bases from the end-repair reaction\n"; } if ($trim){ warn "All sequences will be trimmed by 1 bp on their 3' end to avoid problems with invalid paired-end alignments with Bowtie 1\n"; print REPORT "All sequences will be trimmed by 1 bp on their 3' end to avoid problems with invalid paired-end alignments with Bowtie 1\n"; } if ($clip_r1){ warn "All Read 1 sequences will be trimmed by $clip_r1 bp from their 5' end to avoid poor qualities or biases\n"; print REPORT "All Read 1 sequences will be trimmed by $clip_r1 bp from their 5' end to avoid poor qualities or biases\n"; } if ($clip_r2){ warn "All Read 2 sequences will be trimmed by $clip_r2 bp from their 5' end to avoid poor qualities or biases (e.g. M-bias for BS-Seq applications)\n"; print REPORT "All Read 2 sequences will be trimmed by $clip_r2 bp from their 5' end to avoid poor qualities or biases (e.g. M-bias for BS-Seq applications)\n"; } if ($three_prime_clip_r1){ warn "All Read 1 sequences will be trimmed by $three_prime_clip_r1 bp from their 3' end to avoid poor qualities or biases\n"; print REPORT "All Read 1 sequences will be trimmed by $three_prime_clip_r1 bp from their 3' end to avoid poor qualities or biases\n"; } if ($three_prime_clip_r2){ warn "All Read 2 sequences will be trimmed by $three_prime_clip_r2 bp from their 3' end to avoid poor qualities or biases\n"; print REPORT "All Read 2 sequences will be trimmed by $three_prime_clip_r2 bp from their 3' end to avoid poor qualities or biases\n"; } if ($fastqc){ warn "Running FastQC on the data once trimming has completed\n"; print REPORT "Running FastQC on the data once trimming has completed\n"; if ($fastqc_args){ warn "Running FastQC with the following extra arguments: '$fastqc_args'\n"; print REPORT "Running FastQC with the following extra arguments: $fastqc_args\n"; } } if ($keep and $rrbs){ warn "Keeping quality trimmed (but not yet adapter trimmed) intermediate FastQ file\n"; print REPORT "Keeping quality trimmed (but not yet adapter trimmed) intermediate FastQ file\n"; } if ($gzip or $filename =~ /\.gz$/){ $gzip = 1; unless ($dont_gzip){ warn "Output file(s) will be GZIP compressed\n"; print REPORT "Output file will be GZIP compressed\n"; } } warn "\n"; print REPORT "\n"; sleep (3); my $temp; ### Proceeding differently for RRBS and other type of libraries if ($rrbs){ ### Skipping quality filtering for RRBS libraries if a quality cutoff of 0 was specified if ($cutoff == 0){ warn "Quality cutoff selected was 0 - Skipping quality trimming altogether\n\n"; sleep (3); } else{ $temp = $filename; $temp =~ s/^.*\///; # replacing optional file path information $temp =~ s/$/_qual_trimmed.fastq/; open (TEMP,'>',$output_dir.$temp) or die "Can't write to '$temp': $!"; warn " >>> Now performing adaptive quality trimming with a Phred-score cutoff of: $cutoff <<<\n\n"; sleep (3); open (QUAL,"$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -a X $filename |") or die "Can't open pipe to Cutadapt: $!"; my $qual_count = 0; while (1){ my $l1 = <QUAL>; my $seq = <QUAL>; my $l3 = <QUAL>; my $qual = <QUAL>; last unless (defined $qual); $qual_count++; if ($qual_count%10000000 == 0){ warn "$qual_count sequences processed\n"; } print TEMP "$l1$seq$l3$qual"; } warn "\n >>> Quality trimming completed <<<\n$qual_count sequences processed in total\n\n"; close QUAL or die "Unable to close QUAL filehandle: $!\n"; sleep (3); } } if ($output_filename =~ /\.fastq$/){ $output_filename =~ s/\.fastq$/_trimmed.fq/; } elsif ($output_filename =~ /\.fastq\.gz$/){ $output_filename =~ s/\.fastq\.gz$/_trimmed.fq/; } elsif ($output_filename =~ /\.fq$/){ $output_filename =~ s/\.fq$/_trimmed.fq/; } elsif ($output_filename =~ /\.fq\.gz$/){ $output_filename =~ s/\.fq\.gz$/_trimmed.fq/; } else{ $output_filename =~ s/$/_trimmed.fq/; } if ($gzip or $filename =~ /\.gz$/){ if ($dont_gzip){ open (OUT,'>',$output_dir.$output_filename) or die "Can't open '$output_filename': $!\n"; # don't need to gzip intermediate file } else{ ### 6 Jan 2014: had a request to also gzip intermediate files to save disk space # if ($validate){ # open (OUT,'>',$output_dir.$output_filename) or die "Can't open '$output_filename': $!\n"; # don't need to gzip intermediate file # } $output_filename .= '.gz'; open (OUT,"| gzip -c - > ${output_dir}${output_filename}") or die "Can't write to '$output_filename': $!\n"; } } else{ open (OUT,'>',$output_dir.$output_filename) or die "Can't open '$output_filename': $!\n"; } warn "Writing final adapter and quality trimmed output to $output_filename\n\n"; my $count = 0; my $too_short = 0; my $quality_trimmed = 0; my $rrbs_trimmed = 0; my $rrbs_trimmed_start = 0; my $CAA = 0; my $CGA = 0; my $pid; if ($rrbs and $cutoff != 0){ ### optionally using 2 different adapters for read 1 and read 2 if ($validate and $a2){ ### Figure out whether current file counts as read 1 or read 2 of paired-end files if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair warn "\n >>> Now performing adapter trimming for the adapter sequence: '$adapter' from file $temp <<< \n"; sleep (3); $pid = open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n"; } else{ # this is read 2 of a pair warn "\n >>> Now performing adapter trimming for the adapter sequence: '$a2' from file $temp <<< \n"; sleep (3); $pid = open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $a2 $output_dir$temp") or die "Failed to launch Cutadapt: $!\n"; } } ### Using the same adapter for both read 1 and read 2 else{ warn "\n >>> Now performing adapter trimming for the adapter sequence: '$adapter' from file $temp <<< \n"; sleep (3); $pid = open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n"; } close WRITER or die $!; # not needed open (QUAL,"$output_dir$temp") or die $!; # quality trimmed file if ($filename =~ /\.gz$/){ open (IN,"zcat $filename |") or die $!; # original, untrimmed file } else{ open (IN,$filename) or die $!; # original, untrimmed file } while (1){ # we can process the output from Cutadapt and the original input 1 by 1 to decide if the adapter has been removed or not my $l1 = <TRIM>; my $seq = <TRIM>; # adapter trimmed sequence my $l3 = <TRIM>; my $qual = <TRIM>; $_ = <IN>; # irrelevant my $original_seq = <IN>; $_ = <IN>; # irrelevant $_ = <IN>; # irrelevant $_ = <QUAL>; # irrelevant my $qual_trimmed_seq = <QUAL>; $_ = <QUAL>; # irrelevant my $qual_trimmed_qual = <QUAL>; last unless (defined $qual and defined $qual_trimmed_qual); # could be empty strings $count++; if ($count%10000000 == 0){ warn "$count sequences processed\n"; } chomp $seq; chomp $qual; chomp $qual_trimmed_seq; chomp $original_seq; my $quality_trimmed_seq_length = length $qual_trimmed_seq; if (length $original_seq > length $qual_trimmed_seq){ ++$quality_trimmed; } my $nd = 0; ### NON-DIRECTIONAL RRBS if ($non_directional){ if (length$seq > 2){ if ($seq =~ /^CAA/){ ++$CAA; $seq = substr ($seq,2,length($seq)-2); $qual = substr ($qual,2,length($qual)-2); ++$rrbs_trimmed_start; $nd = 1; } elsif ($seq =~ /^CGA/){ $seq = substr ($seq,2,length($seq)-2); $qual = substr ($qual,2,length($qual)-2); ++$CGA; ++$rrbs_trimmed_start; $nd = 1; } } } ### directional read unless ($nd == 1){ if (length $seq >= 2 and length$seq < $quality_trimmed_seq_length){ $seq = substr ($seq,0,length($seq)-2); $qual = substr ($qual,0,length($qual)-2); ++$rrbs_trimmed; } } ### Shortening all sequences by 1 bp on the 3' end if ($trim){ $seq = substr($seq,0,length($seq)-1); $qual = substr($qual,0,length($qual)-1); } ### PRINTING (POTENTIALLY TRIMMED) SEQUENCE if ($validate){ # printing the sequence without performing a length check (this is performed for the read pair separately later) print OUT "$l1$seq\n$l3$qual\n"; } else{ # single end if ($clip_r1){ if (length $seq > $clip_r1){ # sequences that are already too short won't be clipped again $seq = substr($seq,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence $qual = substr($qual,$clip_r1); } } if ($three_prime_clip_r1){ if (length $seq > $three_prime_clip_r1){ # sequences that are already too short won't be clipped again # warn "seq/qual before/after trimming:\n$seq\n$qual\n"; $seq = substr($seq,0,(length($seq) - $three_prime_clip_r1)); # starting after the sequences to be trimmed until the end of the sequence $qual = substr($qual,0,(length($qual) - $three_prime_clip_r1 )); # warn "$seq\n$qual\n"; } } if (length $seq < $length_cutoff){ ++$too_short; next; } else{ print OUT "$l1$seq\n$l3$qual\n"; } } } print REPORT "\n"; while (<ERROR>){ warn $_; print REPORT $_; } close IN or die "Unable to close IN filehandle: $!"; close QUAL or die "Unable to close QUAL filehandle: $!"; close TRIM or die "Unable to close TRIM filehandle: $!"; close OUT or die "Unable to close OUT filehandle: $!"; } else{ ### optionally using 2 different adapters for read 1 and read 2 if ($validate and $a2){ ### Figure out whether current file counts as read 1 or read 2 of paired-end files if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$adapter' from file $filename <<< \n"; sleep (3); $pid = open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $adapter $filename") or die "Failed to launch Cutadapt: $!"; } else{ # this is read 2 of a pair warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$a2' from file $filename <<< \n"; sleep (3); $pid = open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $a2 $filename") or die "Failed to launch Cutadapt: $!"; } } ### Using the same adapter for both read 1 and read 2 else{ warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$adapter' from file $filename <<< \n"; sleep (3); $pid = open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $adapter $filename") or die "Failed to launch Cutadapt: $!"; } close WRITER or die $!; # not needed while (1){ my $l1 = <TRIM>; my $seq = <TRIM>; # quality and/or adapter trimmed sequence my $l3 = <TRIM>; my $qual = <TRIM>; # print "$l1$seq\n$l3$qual\n"; last unless (defined $qual); # could be an empty string $count++; if ($count%10000000 == 0){ warn "$count sequences processed\n"; } chomp $seq; chomp $qual; ### Shortening all sequences by 1 bp on the 3' end if ($trim){ $seq = substr($seq,0,length($seq)-1); $qual = substr($qual,0,length($qual)-1); } ### PRINTING (POTENTIALLY TRIMMED) SEQUENCE if ($validate){ # printing the sequence without performing a length check (this is performed for the read pair separately later) print OUT "$l1$seq\n$l3$qual\n"; } else{ # single end if ($clip_r1){ if (length $seq > $clip_r1){ # sequences that are already too short won't be clipped again $seq = substr($seq,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence $qual = substr($qual,$clip_r1); } } if ($three_prime_clip_r1){ if (length $seq > $three_prime_clip_r1){ # sequences that are already too short won't be clipped again # warn "seq/qual before/after trimming:\n$seq\n$qual\n"; $seq = substr($seq,0,(length($seq) - $three_prime_clip_r1)); # starting after the sequences to be trimmed until the end of the sequence $qual = substr($qual,0,(length($qual) - $three_prime_clip_r1)); # warn "$seq\n$qual\n";sleep(1); } } if (length $seq < $length_cutoff){ ++$too_short; next; } else{ print OUT "$l1$seq\n$l3$qual\n"; } } } print REPORT "\n"; while (<ERROR>){ warn $_; print REPORT $_; } close TRIM or die "Unable to close TRIM filehandle: $!\n"; close ERROR or die "Unable to close ERROR filehandle: $!\n"; close OUT or die "Unable to close OUT filehandle: $!\n"; } if ($rrbs){ unless ($keep){ # keeping the quality trimmed intermediate file for RRBS files # deleting temporary quality trimmed file my $deleted = unlink "$output_dir$temp"; if ($deleted){ warn "Successfully deleted temporary file $temp\n\n"; } else{ warn "Could not delete temporary file $temp"; } } } ### Wait and reap the child process (Cutadapt) so that it doesn't become a zombie process waitpid $pid, 0; unless ($? == 0){ die "\n\nCutadapt terminated with exit signal: '$?'.\nTerminating Trim Galore run, please check error message(s) to get an idea what went wrong...\n\n"; } warn "\nRUN STATISTICS FOR INPUT FILE: $filename\n"; print REPORT "\nRUN STATISTICS FOR INPUT FILE: $filename\n"; warn "="x 45,"\n"; print REPORT "="x 45,"\n"; warn "$count sequences processed in total\n"; print REPORT "$count sequences processed in total\n"; ### only reporting this separately if quality and adapter trimming were performed separately if ($rrbs){ my $percentage_shortened; if ($count){ $percentage_shortened = sprintf ("%.1f",$quality_trimmed/$count*100); warn "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n"; print REPORT "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n"; } else{ warn "Unable to determine percentage of reads that were shortened because 0 lines were processed\n\n"; print REPORT "Unable to determine percentage of reads that were shortened because 0 lines were processed\n\n"; } } my $percentage_too_short; if ($count){ $percentage_too_short = sprintf ("%.1f",$too_short/$count*100); } else{ $percentage_too_short = 'N/A'; } if ($validate){ ### only for paired-end files warn "The length threshold of paired-end sequences gets evaluated later on (in the validation step)\n"; } else{ ### Single-end file warn "Sequences removed because they became shorter than the length cutoff of $length_cutoff bp:\t$too_short ($percentage_too_short%)\n"; print REPORT "Sequences removed because they became shorter than the length cutoff of $length_cutoff bp:\t$too_short ($percentage_too_short%)\n"; } if ($rrbs){ my $percentage_rrbs_trimmed = sprintf ("%.1f",$rrbs_trimmed/$count*100); warn "RRBS reads trimmed by additional 2 bp when adapter contamination was detected:\t$rrbs_trimmed ($percentage_rrbs_trimmed%)\n"; print REPORT "RRBS reads trimmed by additional 2 bp when adapter contamination was detected:\t$rrbs_trimmed ($percentage_rrbs_trimmed%)\n"; } if ($non_directional){ my $percentage_rrbs_trimmed_at_start = sprintf ("%.1f",$rrbs_trimmed_start/$count*100); warn "RRBS reads trimmed by 2 bp at the start when read started with CAA ($CAA) or CGA ($CGA) in total:\t$rrbs_trimmed_start ($percentage_rrbs_trimmed_at_start%)\n"; print REPORT "RRBS reads trimmed by 2 bp at the start when read started with CAA ($CAA) or CGA ($CGA) in total:\t$rrbs_trimmed_start ($percentage_rrbs_trimmed_at_start%)\n"; } warn "\n"; print REPORT "\n"; ### RUNNING FASTQC unless we are dealing with paired-end files unless($validate){ if ($fastqc){ warn "\n >>> Now running FastQC on the data <<<\n\n"; sleep (5); if ($fastqc_args){ system ("$path_to_fastqc $fastqc_args $output_dir$output_filename"); } else{ system ("$path_to_fastqc $output_dir$output_filename"); } } } ### VALIDATE PAIRED-END FILES if ($validate){ ### Figure out whether current file counts as read 1 or read 2 of paired-end files if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair $file_1 = $output_filename; shift @filenames; # warn "This is read 1: $file_1\n\n"; } else{ # this is read 2 of a pair $file_2 = $output_filename; shift @filenames; # warn "This is read 2: $file_2\n\n"; } if ($file_1 and $file_2){ warn "Validate paired-end files $file_1 and $file_2\n"; sleep (1); my ($val_1,$val_2,$un_1,$un_2) = validate_paired_end_files($file_1,$file_2); ### RUNNING FASTQC if ($fastqc){ warn "\n >>> Now running FastQC on the validated data $val_1<<<\n\n"; sleep (3); if ($fastqc_args){ system ("$path_to_fastqc $fastqc_args $output_dir$val_1"); } else{ system ("$path_to_fastqc $output_dir$val_1"); } warn "\n >>> Now running FastQC on the validated data $val_2<<<\n\n"; sleep (3); if ($fastqc_args){ system ("$path_to_fastqc $fastqc_args $output_dir$val_2"); } else{ system ("$path_to_fastqc $output_dir$val_2"); } } warn "Deleting both intermediate output files $file_1 and $file_2\n"; unlink "$output_dir$file_1"; unlink "$output_dir$file_2"; warn "\n",'='x100,"\n\n"; sleep (3); $file_1 = undef; # setting file_1 and file_2 to undef once validation is completed $file_2 = undef; } } } sub validate_paired_end_files{ my $file_1 = shift; my $file_2 = shift; warn "file_1: $file_1, file_2: $file_2\n\n"; if ($file_1 =~ /\.gz$/){ open (IN1,"zcat $output_dir$file_1 |") or die "Couldn't read from file $file_1: $!\n"; } else{ open (IN1, "$output_dir$file_1") or die "Couldn't read from file $file_1: $!\n"; } if ($file_2 =~ /\.gz$/){ open (IN2,"zcat $output_dir$file_2 |") or die "Couldn't read from file $file_2: $!\n"; } else{ open (IN2, "$output_dir$file_2") or die "Couldn't read from file $file_2: $!\n"; } warn "\n>>>>> Now validing the length of the 2 paired-end infiles: $file_1 and $file_2 <<<<<\n"; sleep (3); my $out_1 = $file_1; my $out_2 = $file_2; if ($out_1 =~ /gz$/){ $out_1 =~ s/trimmed\.fq\.gz$/val_1.fq/; } else{ $out_1 =~ s/trimmed\.fq$/val_1.fq/; } if ($out_2 =~ /gz$/){ $out_2 =~ s/trimmed\.fq\.gz$/val_2.fq/; } else{ $out_2 =~ s/trimmed\.fq$/val_2.fq/; } if ($gzip){ if ($dont_gzip){ open (R1,'>',$output_dir.$out_1) or die "Couldn't write to $out_1 $!\n"; } else{ $out_1 .= '.gz'; open (R1,"| gzip -c - > ${output_dir}${out_1}") or die "Can't write to $out_1: $!\n"; } } else{ open (R1,'>',$output_dir.$out_1) or die "Couldn't write to $out_1 $!\n"; } if ($gzip){ if ($dont_gzip){ open (R2,'>',$output_dir.$out_2) or die "Couldn't write to $out_2 $!\n"; } else{ $out_2 .= '.gz'; open (R2,"| gzip -c - > ${output_dir}${out_2}") or die "Can't write to $out_2: $!\n"; } } else{ open (R2,'>',$output_dir.$out_2) or die "Couldn't write to $out_2 $!\n"; } warn "Writing validated paired-end read 1 reads to $out_1\n"; warn "Writing validated paired-end read 2 reads to $out_2\n\n"; my $unpaired_1; my $unpaired_2; if ($retain){ $unpaired_1 = $file_1; $unpaired_2 = $file_2; if ($unpaired_1 =~ /gz$/){ $unpaired_1 =~ s/trimmed\.fq\.gz$/unpaired_1.fq/; } else{ $unpaired_1 =~ s/trimmed\.fq$/unpaired_1.fq/; } if ($unpaired_2 =~ /gz$/){ $unpaired_2 =~ s/trimmed\.fq\.gz$/unpaired_2.fq/; } else{ $unpaired_2 =~ s/trimmed\.fq$/unpaired_2.fq/; } if ($gzip){ if ($dont_gzip){ open (UNPAIRED1,'>',$output_dir.$unpaired_1) or die "Couldn't write to $unpaired_1: $!\n"; } else{ $unpaired_1 .= '.gz'; open (UNPAIRED1,"| gzip -c - > ${output_dir}${unpaired_1}") or die "Can't write to $unpaired_1: $!\n"; } } else{ open (UNPAIRED1,'>',$output_dir.$unpaired_1) or die "Couldn't write to $unpaired_1: $!\n"; } if ($gzip){ if ($dont_gzip){ open (UNPAIRED2,'>',$output_dir.$unpaired_2) or die "Couldn't write to $unpaired_2: $!\n"; } else{ $unpaired_2 .= '.gz'; open (UNPAIRED2,"| gzip -c - > ${output_dir}${unpaired_2}") or die "Can't write to $unpaired_2: $!\n"; } } else{ open (UNPAIRED2,'>',$output_dir.$unpaired_2) or die "Couldn't write to $unpaired_2: $!\n"; } warn "Writing unpaired read 1 reads to $unpaired_1\n"; warn "Writing unpaired read 2 reads to $unpaired_2\n\n"; } my $sequence_pairs_removed = 0; my $read_1_printed = 0; my $read_2_printed = 0; my $count = 0; while (1){ my $id_1 = <IN1>; my $seq_1 = <IN1>; my $l3_1 = <IN1>; my $qual_1 = <IN1>; last unless ($id_1 and $seq_1 and $l3_1 and $qual_1); my $id_2 = <IN2>; my $seq_2 = <IN2>; my $l3_2 = <IN2>; my $qual_2 = <IN2>; last unless ($id_2 and $seq_2 and $l3_2 and $qual_2); ++$count; ## small check if the sequence files appear to be FastQ files if ($count == 1){ # performed just once if ($id_1 !~ /^\@/ or $l3_1 !~ /^\+/){ die "Input file doesn't seem to be in FastQ format at sequence $count\n"; } if ($id_2 !~ /^\@/ or $l3_2 !~ /^\+/){ die "Input file doesn't seem to be in FastQ format at sequence $count\n"; } } chomp $seq_1; chomp $seq_2; chomp $qual_1; chomp $qual_2; if ($clip_r1){ if (length $seq_1 > $clip_r1){ # sequences that are already too short won't be trimmed again $seq_1 = substr($seq_1,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence $qual_1 = substr($qual_1,$clip_r1); } } if ($clip_r2){ if (length $seq_2 > $clip_r2){ # sequences that are already too short won't be trimmed again $seq_2 = substr($seq_2,$clip_r2); # starting after the sequences to be trimmed until the end of the sequence $qual_2 = substr($qual_2,$clip_r2); } } if ($three_prime_clip_r1){ if (length $seq_1 > $three_prime_clip_r1){ # sequences that are already too short won't be clipped again $seq_1 = substr($seq_1,0,(length($seq_1) - $three_prime_clip_r1)); # starting after the sequences to be trimmed until the end of the sequence $qual_1 = substr($qual_1,0,(length($qual_1) - $three_prime_clip_r1)); } } if ($three_prime_clip_r2){ if (length $seq_2 > $three_prime_clip_r2){ # sequences that are already too short won't be clipped again $seq_2 = substr($seq_2,0,(length($seq_2) - $three_prime_clip_r2)); # starting after the sequences to be trimmed until the end of the sequence $qual_2 = substr($qual_2,0,(length($qual_2) - $three_prime_clip_r2)); } } ### making sure that the reads do have a sensible length if ( (length($seq_1) < $length_cutoff) or (length($seq_2) < $length_cutoff) ){ ++$sequence_pairs_removed; if ($retain){ # writing out single-end reads if they are longer than the cutoff if ( length($seq_1) >= $length_read_1){ # read 1 is long enough print UNPAIRED1 $id_1; print UNPAIRED1 "$seq_1\n"; print UNPAIRED1 $l3_1; print UNPAIRED1 "$qual_1\n"; ++$read_1_printed; } if ( length($seq_2) >= $length_read_2){ # read 2 is long enough print UNPAIRED2 $id_2; print UNPAIRED2 "$seq_2\n"; print UNPAIRED2 $l3_2; print UNPAIRED2 "$qual_2\n"; ++$read_2_printed; } } } else{ print R1 $id_1; print R1 "$seq_1\n"; print R1 $l3_1; print R1 "$qual_1\n"; print R2 $id_2; print R2 "$seq_2\n"; print R2 $l3_2; print R2 "$qual_2\n"; } } my $percentage; if ($count){ $percentage = sprintf("%.2f",$sequence_pairs_removed/$count*100); } else{ $percentage = 'N/A'; } warn "Total number of sequences analysed: $count\n\n"; warn "Number of sequence pairs removed because at least one read was shorter than the length cutoff ($length_cutoff bp): $sequence_pairs_removed ($percentage%)\n"; print REPORT "Total number of sequences analysed for the sequence pair length validation: $count\n\n"; print REPORT "Number of sequence pairs removed because at least one read was shorter than the length cutoff ($length_cutoff bp): $sequence_pairs_removed ($percentage%)\n"; if ($keep){ warn "Number of unpaired read 1 reads printed: $read_1_printed\n"; warn "Number of unpaired read 2 reads printed: $read_2_printed\n"; } close R1 or die $!; close R2 or die $!; if ($retain){ close UNPAIRED1 or die $!; close UNPAIRED2 or die $!; } warn "\n"; if ($retain){ return ($out_1,$out_2,$unpaired_1,$unpaired_2); } else{ return ($out_1,$out_2); } } sub file_sanity_check{ my $file = shift; open (SANITY,$file) or die "Failed to read from file '$file' to perform sanity check\n"; # just processing a single FastQ entry my $id = <SANITY>; my $seq = <SANITY>; my $three = <SANITY>; my $qual = <SANITY>; unless ($id and $seq and $three and $qual){ warn "Input file '$file' seems to be completely empty. Consider respecifying!\n\n"; } return; chomp $seq; # testing if the file is a colorspace file in which case we bail if ($seq =~ /\d+/){ die "File seems to be in SOLiD colorspace format which is not supported by Trim Galore (sequence is: '$seq')! Please use Cutadapt on colorspace files separately and check its documentation!\n\n"; } close SANITY; } sub process_commandline{ my $help; my $quality; my $adapter; my $adapter2; my $stringency; my $report; my $version; my $rrbs; my $length_cutoff; my $keep; my $fastqc; my $non_directional; my $phred33; my $phred64; my $fastqc_args; my $trim; my $gzip; my $validate; my $retain; my $length_read_1; my $length_read_2; my $error_rate; my $output_dir; my $no_report_file; my $suppress_warn; my $dont_gzip; my $clip_r1; my $clip_r2; my $three_prime_clip_r1; my $three_prime_clip_r2; my $nextera; my $small_rna; my $illumina; my $path_to_cutadapt; my $command_line = GetOptions ('help|man' => \$help, 'q|quality=i' => \$quality, 'a|adapter=s' => \$adapter, 'a2|adapter2=s' => \$adapter2, 'report' => \$report, 'version' => \$version, 'stringency=i' => \$stringency, 'fastqc' => \$fastqc, 'RRBS' => \$rrbs, 'keep' => \$keep, 'length=i' => \$length_cutoff, 'non_directional' => \$non_directional, 'phred33' => \$phred33, 'phred64' => \$phred64, 'fastqc_args=s' => \$fastqc_args, 'trim1' => \$trim, 'gzip' => \$gzip, 'paired_end' => \$validate, 'retain_unpaired' => \$retain, 'length_1|r1=i' => \$length_read_1, 'length_2|r2=i' => \$length_read_2, 'e|error_rate=s' => \$error_rate, 'o|output_dir=s' => \$output_dir, 'no_report_file' => \$no_report_file, 'suppress_warn' => \$suppress_warn, 'dont_gzip' => \$dont_gzip, 'clip_R1=i' => \$clip_r1, 'clip_R2=i' => \$clip_r2, 'three_prime_clip_R1=i' => \$three_prime_clip_r1, 'three_prime_clip_R2=i' => \$three_prime_clip_r2, 'illumina' => \$illumina, 'nextera' => \$nextera, 'small_rna' => \$small_rna, 'path_to_cutadapt=s' => \$path_to_cutadapt, ); ### EXIT ON ERROR if there were errors with any of the supplied options unless ($command_line){ die "Please respecify command line options\n"; } ### HELPFILE if ($help){ print_helpfile(); exit; } if ($version){ print << "VERSION"; Quality-/Adapter-/RRBS-Trimming (powered by Cutadapt) version $trimmer_version Last update: 06 05 2015 VERSION exit; } ### RRBS unless ($rrbs){ $rrbs = 0; } ### SUPRESS WARNINGS if (defined $suppress_warn){ $DOWARN = 0; } ### QUALITY SCORES my $phred_encoding; if ($phred33){ if ($phred64){ die "Please specify only a single quality encoding type (--phred33 or --phred64)\n\n"; } $phred_encoding = 33; } elsif ($phred64){ $phred_encoding = 64; } unless ($phred33 or $phred64){ warn "No quality encoding type selected. Assuming that the data provided uses Sanger encoded Phred scores (default)\n\n"; $phred_encoding = 33; sleep (1); } ### NON-DIRECTIONAL RRBS if ($non_directional){ unless ($rrbs){ die "Option '--non_directional' requires '--rrbs' to be specified as well. Please re-specify!\n"; } } else{ $non_directional = 0; } if ($fastqc_args){ $fastqc = 1; # specifying fastqc extra arguments automatically means that FastQC will be executed } else{ $fastqc_args = 0; } ### CUSTOM ERROR RATE if (defined $error_rate){ # make sure that the error rate is between 0 and 1 unless ($error_rate >= 0 and $error_rate <= 1){ die "Please specify an error rate between 0 and 1 (the default is 0.1)\n"; } } else{ $error_rate = 0.1; # (default) } if ($nextera and $small_rna or $nextera and $illumina or $illumina and $small_rna ){ die "You can't use several different adapter types at the same time. Make your choice or consider using -a and -a2\n\n"; } if (defined $adapter){ unless ($adapter =~ /^[ACTGNXactgnx]+$/){ die "Adapter sequence must contain DNA characters only (A,C,T,G or N)!\n"; } $adapter = uc$adapter; if ($illumina){ die "You can't supply an adapter sequence AND use the Illumina universal adapter sequence. Make your choice.\n\n"; } if ($nextera){ die "You can't supply an adapter sequence AND use the Nextera transposase adapter sequence. Make your choice.\n\n"; } if ($small_rna){ die "You can't supply an adapter sequence AND use the Illumina small RNA adapter sequence. Make your choice.\n\n"; } } if (defined $adapter2){ unless ($validate){ die "An optional adapter for read 2 of paired-end files requires '--paired' to be specified as well! Please re-specify\n"; } unless ($adapter2 =~ /^[ACTGNactgn]+$/){ die "Optional adapter 2 sequence must contain DNA characters only (A,C,T,G or N)!\n"; } $adapter2 = uc$adapter2; } ### LENGTH CUTOFF unless (defined $length_cutoff){ $length_cutoff = 20; } ### files are supposed to be paired-end files if ($validate){ # making sure that an even number of reads has been supplied unless ((scalar@ARGV)%2 == 0){ die "Please provide an even number of input files for paired-end FastQ trimming! Aborting ...\n"; } ## CUTOFF FOR VALIDATED READ-PAIRS if (defined $length_read_1 or defined $length_read_2){ unless ($retain){ die "Please specify --keep_unpaired to alter the unpaired single-end read length cut off(s)\n\n"; } if (defined $length_read_1){ unless ($length_read_1 >= 15 and $length_read_1 <= 100){ die "Please select a sensible cutoff for when a read pair should be filtered out due to short length (allowed range: 15-100 bp)\n\n"; } unless ($length_read_1 > $length_cutoff){ die "The single-end unpaired read length needs to be longer than the paired-end cut-off value ($length_cutoff bp)\n\n"; } } if (defined $length_read_2){ unless ($length_read_2 >= 15 and $length_read_2 <= 100){ die "Please select a sensible cutoff for when a read pair should be filtered out due to short length (allowed range: 15-100 bp)\n\n"; } unless ($length_read_2 > $length_cutoff){ die "The single-end unpaired read length needs to be longer than the paired-end cut-off value ($length_cutoff bp)\n\n"; } } } if ($retain){ $length_read_1 = 35 unless (defined $length_read_1); $length_read_2 = 35 unless (defined $length_read_2); } } unless ($no_report_file){ $no_report_file = 0; } ### OUTPUT DIR PATH if ($output_dir){ unless ($output_dir =~ /\/$/){ $output_dir =~ s/$/\//; } } else{ $output_dir = ''; } ### Trimming at the 5' end if (defined $clip_r2){ # trimming 5' bases of read 1 die "Clipping the 5' end of read 2 is only allowed for paired-end files (--paired)\n" unless ($validate); } if (defined $clip_r1){ # trimming 5' bases of read 1 unless ($clip_r1 > 0 and $clip_r1 < 100){ die "The 5' clipping value for read 1 should have a sensible value (> 0 and < read length)\n\n"; } } if (defined $clip_r2){ # trimming 5' bases of read 2 unless ($clip_r2 > 0 and $clip_r2 < 100){ die "The 5' clipping value for read 2 should have a sensible value (> 0 and < read length)\n\n"; } } ### Trimming at the 3' end if (defined $three_prime_clip_r1){ # trimming 3' bases of read 1 unless ($three_prime_clip_r1 > 0 and $three_prime_clip_r1 < 100){ die "The 3' clipping value for read 1 should have a sensible value (> 0 and < read length)\n\n"; } } if (defined $three_prime_clip_r2){ # trimming 3' bases of read 2 unless ($three_prime_clip_r2 > 0 and $three_prime_clip_r2 < 100){ die "The 3' clipping value for read 2 should have a sensible value (> 0 and < read length)\n\n"; } } return ($quality,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$adapter2,$error_rate,$output_dir,$no_report_file,$dont_gzip,$clip_r1,$clip_r2,$three_prime_clip_r1,$three_prime_clip_r2,$nextera,$small_rna,$path_to_cutadapt,$illumina); } sub print_helpfile{ print << "HELP"; USAGE: trim_galore [options] <filename(s)> -h/--help Print this help message and exits. -v/--version Print the version information and exits. -q/--quality <INT> Trim low-quality ends from reads in addition to adapter removal. For RRBS samples, quality trimming will be performed first, and adapter trimming is carried in a second round. Other files are quality and adapter trimmed in a single pass. The algorithm is the same as the one used by BWA (Subtract INT from all qualities; compute partial sums from all indices to the end of the sequence; cut sequence at the index at which the sum is minimal). Default Phred score: 20. --phred33 Instructs Cutadapt to use ASCII+33 quality scores as Phred scores (Sanger/Illumina 1.9+ encoding) for quality trimming. Default: ON. --phred64 Instructs Cutadapt to use ASCII+64 quality scores as Phred scores (Illumina 1.5 encoding) for quality trimming. --fastqc Run FastQC in the default mode on the FastQ file once trimming is complete. --fastqc_args "<ARGS>" Passes extra arguments to FastQC. If more than one argument is to be passed to FastQC they must be in the form "arg1 arg2 etc.". An example would be: --fastqc_args "--nogroup --outdir /home/". Passing extra arguments will automatically invoke FastQC, so --fastqc does not have to be specified separately. -a/--adapter <STRING> Adapter sequence to be trimmed. If not specified explicitly, Trim Galore will try to auto-detect whether the Illumina universal, Nextera transposase or Illumina small RNA adapter sequence was used. Also see '--illumina', '--nextera' and '--small_rna'. If no adapter can be detected within the first 1 million sequences of the first file specified Trim Galore defaults to '--illumina'. -a2/--adapter2 <STRING> Optional adapter sequence to be trimmed off read 2 of paired-end files. This option requires '--paired' to be specified as well. --illumina Adapter sequence to be trimmed is the first 13bp of the Illumina universal adapter 'AGATCGGAAGAGC' instead of the default auto-detection of adapter sequence. --nextera Adapter sequence to be trimmed is the first 12bp of the Nextera adapter 'CTGTCTCTTATA' instead of the default auto-detection of adapter sequence. --small_rna Adapter sequence to be trimmed is the first 12bp of the Illumina Small RNA Adapter 'ATGGAATTCTCG' instead of the default auto-detection of adapter sequence. --stringency <INT> Overlap with adapter sequence required to trim a sequence. Defaults to a very stringent setting of 1, i.e. even a single bp of overlapping sequence will be trimmed off from the 3' end of any read. -e <ERROR RATE> Maximum allowed error rate (no. of errors divided by the length of the matching region) (default: 0.1) --gzip Compress the output file with GZIP. If the input files are GZIP-compressed the output files will automatically be GZIP compressed as well. As of v0.2.8 the compression will take place on the fly. --dont_gzip Output files won't be compressed with GZIP. This option overrides --gzip. --length <INT> Discard reads that became shorter than length INT because of either quality or adapter trimming. A value of '0' effectively disables this behaviour. Default: 20 bp. For paired-end files, both reads of a read-pair need to be longer than <INT> bp to be printed out to validated paired-end files (see option --paired). If only one read became too short there is the possibility of keeping such unpaired single-end reads (see --retain_unpaired). Default pair-cutoff: 20 bp. -o/--output_dir <DIR> If specified all output will be written to this directory instead of the current directory. --no_report_file If specified no report file will be generated. --suppress_warn If specified any output to STDOUT or STDERR will be suppressed. --clip_R1 <int> Instructs Trim Galore to remove <int> bp from the 5' end of read 1 (or single-end reads). This may be useful if the qualities were very poor, or if there is some sort of unwanted bias at the 5' end. Default: OFF. --clip_R2 <int> Instructs Trim Galore to remove <int> bp from the 5' end of read 2 (paired-end reads only). This may be useful if the qualities were very poor, or if there is some sort of unwanted bias at the 5' end. For paired-end BS-Seq, it is recommended to remove the first few bp because the end-repair reaction may introduce a bias towards low methylation. Please refer to the M-bias plot section in the Bismark User Guide for some examples. Default: OFF. --three_prime_clip_R1 <int> Instructs Trim Galore to remove <int> bp from the 3' end of read 1 (or single-end reads) AFTER adapter/quality trimming has been performed. This may remove some unwanted bias from the 3' end that is not directly related to adapter sequence or basecall quality. Default: OFF. --three_prime_clip_R2 <int> Instructs Trim Galore to remove <int> bp from the 3' end of read 2 AFTER adapter/quality trimming has been performed. This may remove some unwanted bias from the 3' end that is not directly related to adapter sequence or basecall quality. Default: OFF. --path_to_cutadapt </path/to/cutadapt> You may use this option to specify a path to the Cutadapt executable, e.g. /my/home/cutadapt-1.7.1/bin/cutadapt. Else it is assumed that Cutadapt is in the PATH. RRBS-specific options (MspI digested material): --rrbs Specifies that the input file was an MspI digested RRBS sample (recognition site: CCGG). Sequences which were adapter-trimmed will have a further 2 bp removed from their 3' end. This is to avoid that the filled-in C close to the second MspI site in a sequence is used for methylation calls. Sequences which were merely trimmed because of poor quality will not be shortened further. --non_directional Selecting this option for non-directional RRBS libraries will screen quality-trimmed sequences for 'CAA' or 'CGA' at the start of the read and, if found, removes the first two basepairs. Like with the option '--rrbs' this avoids using cytosine positions that were filled-in during the end-repair step. '--non_directional' requires '--rrbs' to be specified as well. --keep Keep the quality trimmed intermediate file. Default: off, which means the temporary file is being deleted after adapter trimming. Only has an effect for RRBS samples since other FastQ files are not trimmed for poor qualities separately. Note for RRBS using MseI: If your DNA material was digested with MseI (recognition motif: TTAA) instead of MspI it is NOT necessary to specify --rrbs or --non_directional since virtually all reads should start with the sequence 'TAA', and this holds true for both directional and non-directional libraries. As the end-repair of 'TAA' restricted sites does not involve any cytosines it does not need to be treated especially. Instead, simply run Trim Galore! in the standard (i.e. non-RRBS) mode. Paired-end specific options: --paired This option performs length trimming of quality/adapter/RRBS trimmed reads for paired-end files. To pass the validation test, both sequences of a sequence pair are required to have a certain minimum length which is governed by the option --length (see above). If only one read passes this length threshold the other read can be rescued (see option --retain_unpaired). Using this option lets you discard too short read pairs without disturbing the sequence-by-sequence order of FastQ files which is required by many aligners. Trim Galore! expects paired-end files to be supplied in a pairwise fashion, e.g. file1_1.fq file1_2.fq SRR2_1.fq.gz SRR2_2.fq.gz ... . -t/--trim1 Trims 1 bp off every read from its 3' end. This may be needed for FastQ files that are to be aligned as paired-end data with Bowtie. This is because Bowtie (1) regards alignments like this: R1 ---------------------------> or this: -----------------------> R1 R2 <--------------------------- <----------------- R2 as invalid (whenever a start/end coordinate is contained within the other read). --retain_unpaired If only one of the two paired-end reads became too short, the longer read will be written to either '.unpaired_1.fq' or '.unpaired_2.fq' output files. The length cutoff for unpaired single-end reads is governed by the parameters -r1/--length_1 and -r2/--length_2. Default: OFF. -r1/--length_1 <INT> Unpaired single-end read length cutoff needed for read 1 to be written to '.unpaired_1.fq' output file. These reads may be mapped in single-end mode. Default: 35 bp. -r2/--length_2 <INT> Unpaired single-end read length cutoff needed for read 2 to be written to '.unpaired_2.fq' output file. These reads may be mapped in single-end mode. Default: 35 bp. Last modified on 06 May 2015. HELP exit; }