diff trim_galore @ 1:898db63d2e84 draft

upgrade to new version
author bgruening
date Wed, 17 Jul 2013 15:05:43 -0400
parents 3c1664caa8e3
children 2c1f0fe810f7
line wrap: on
line diff
--- a/trim_galore	Sat Jul 06 09:52:23 2013 -0400
+++ b/trim_galore	Wed Jul 17 15:05:43 2013 -0400
@@ -7,7 +7,7 @@
 use File::Basename;
 use Cwd;
 
-## This program is Copyright (C) 2012, Felix Krueger (felix.krueger@babraham.ac.uk)
+## This program is Copyright (C) 2012-13, Felix Krueger (felix.krueger@babraham.ac.uk)
 
 ## This program is free software: you can redistribute it and/or modify
 ## it under the terms of the GNU General Public License as published by
@@ -25,7 +25,7 @@
 
 
 ## this script is taking in FastQ sequences and trims them with Cutadapt
-## last modified on 18 10 2012
+## last modified on 10 April 2013
 
 ########################################################################
 
@@ -37,11 +37,17 @@
 ########################################################################
 
 
-my $trimmer_version = '0.2.5';
+my $trimmer_version = '0.2.8';
 my $DOWARN = 1; # print on screen warning and text by default
 BEGIN { $SIG{'__WARN__'} = sub { warn $_[0] if $DOWARN } };
 
-my ($cutoff,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$a2,$error_rate,$output_dir,$no_report_file) = process_commandline();
+my ($cutoff,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$a2,$error_rate,$output_dir,$no_report_file,$dont_gzip,$clip_r1,$clip_r2) = process_commandline();
+
+my @filenames = @ARGV;
+
+die "\nPlease provide the filename(s) of one or more FastQ file(s) to launch Trim Galore!\n
+USAGE:  'trim_galore [options] <filename(s)>'    or    'trim_galore --help'    for more options\n\n" unless (@filenames);
+
 
 ### SETTING DEFAULTS UNLESS THEY WERE SPECIFIED
 unless (defined $cutoff){
@@ -68,8 +74,6 @@
   $cutoff += 31;
 }
 
-my @filenames = @ARGV;
-
 my $file_1;
 my $file_2;
 
@@ -155,7 +159,7 @@
       }
 
       if ($length_read_2 == 35){
-	warn "Length cut-off for read 2: $length_read_2 b (default)\n";
+	warn "Length cut-off for read 2: $length_read_2 bb (default)\n";
 	print REPORT "Length cut-off for read 2: $length_read_2 bp (default)\n";
       }
       else{
@@ -180,6 +184,16 @@
     print REPORT "All sequences will be trimmed by 1 bp on their 3' end to avoid problems with invalid paired-end alignments with Bowtie 1\n";
   }
 
+  if ($clip_r1){
+    warn "All Read 1 sequences will be trimmed by $clip_r1 bp from their 5' end to avoid poor qualities or biases\n";
+    print REPORT "All Read 1 sequences will be trimmed by $clip_r1 bp from their 5' end to avoid poor qualities or biases\n";
+  }
+  if ($clip_r2){
+    warn "All Read 2 sequences will be trimmed by $clip_r2 bp from their 5' end to avoid poor qualities or biases (e.g. M-bias for BS-Seq applications)\n";
+    print REPORT "All Read 2 sequences will be trimmed by $clip_r2 bp from their 5' end to avoid poor qualities or biases (e.g. M-bias for BS-Seq applications)\n";
+  }
+
+
   if ($fastqc){
     warn "Running FastQC on the data once trimming has completed\n";
     print REPORT "Running FastQC on the data once trimming has completed\n";
@@ -195,9 +209,13 @@
     print REPORT "Keeping quality trimmed (but not yet adapter trimmed) intermediate FastQ file\n";
   }
 
+
   if ($gzip or $filename =~ /\.gz$/){
-    warn "Output file will be GZIP compressed\n";
-    print REPORT "Output file will be GZIP compressed\n";
+    $gzip = 1;
+    unless ($dont_gzip){
+      warn "Output file(s) will be GZIP compressed\n";
+      print REPORT "Output file will be GZIP compressed\n";
+    }
   }
 
   warn "\n";
@@ -265,9 +283,24 @@
     $output_filename =~ s/$/_trimmed.fq/;
   }
 
+  if ($gzip or $filename =~ /\.gz$/){
+    unless ($dont_gzip){
+      if ($validate){
+	open (OUT,'>',$output_dir.$output_filename) or die "Can't open $output_filename: $!\n"; # don't need to gzip intermediate file
+      }
+      else{
+	$output_filename .= '.gz';
+	open (OUT,"| gzip -c - > ${output_dir}${output_filename}") or die "Can't write to $output_filename: $!\n";
+      }
+    }
+    else{
+      open (OUT,'>',$output_dir.$output_filename) or die "Can't open $output_filename: $!\n"; # don't need to gzip intermediate file
+    }
+  }
+  else{
+    open (OUT,'>',$output_dir.$output_filename) or die "Can't open $output_filename: $!\n";
+  }
   warn "Writing final adapter and quality trimmed output to $output_filename\n\n";
-  open (OUT,'>',$output_dir.$output_filename) or die "Can't open $output_filename: $!\n";
-  sleep (2);
 
   my $count = 0;
   my $too_short = 0;
@@ -389,6 +422,12 @@
 	print OUT "$l1$seq\n$l3$qual\n";
       }
       else{ # single end
+
+	if ($clip_r1){
+	  $seq = substr($seq,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence
+	  $qual = substr($qual,$clip_r1);
+	}
+
 	if (length $seq < $length_cutoff){
 	  ++$too_short;
 	  next;
@@ -463,6 +502,12 @@
 	print OUT "$l1$seq\n$l3$qual\n";
       }
       else{ # single end
+
+	if ($clip_r1){
+	  $seq = substr($seq,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence
+	  $qual = substr($qual,$clip_r1);
+  	}
+
 	if (length $seq < $length_cutoff){
 	  ++$too_short;
 	  next;
@@ -535,24 +580,17 @@
   warn "\n";
   print REPORT "\n";
 
-  ### RUNNING FASTQC
-  if ($fastqc){
-    warn "\n  >>> Now running FastQC on the data <<<\n\n";
-    sleep (5);
-    if ($fastqc_args){
-      system ("$path_to_fastqc $fastqc_args $output_filename");
-    }
-    else{
-      system ("$path_to_fastqc $output_filename");
-    }
-  }
-
-  ### COMPRESSING OUTPUT FILE
-  unless ($validate){ # not gzipping intermediate files for paired-end files
-    if ($gzip or $filename =~ /\.gz$/){
-      warn "\n  >>> GZIP-ing the output file <<<\n\n";
-      system ("gzip -f $output_filename");
-      $output_filename = $output_filename.'.gz';
+  ### RUNNING FASTQC unless we are dealing with paired-end files
+  unless($validate){
+    if ($fastqc){
+      warn "\n  >>> Now running FastQC on the data <<<\n\n";
+      sleep (5);
+      if ($fastqc_args){
+	system ("$path_to_fastqc $fastqc_args $output_dir$output_filename");
+      }
+      else{
+	system ("$path_to_fastqc $output_dir$output_filename");
+      }
     }
   }
 
@@ -585,43 +623,24 @@
 	sleep (3);
 
 	if ($fastqc_args){
-	  system ("$path_to_fastqc $fastqc_args $val_1");
+	  system ("$path_to_fastqc $fastqc_args $output_dir$val_1");
 	}
 	else{
-	  system ("$path_to_fastqc $val_1");
+	  system ("$path_to_fastqc $output_dir$val_1");
 	}
 
 	warn "\n  >>> Now running FastQC on the validated data $val_2<<<\n\n";
 	sleep (3);
 
 	if ($fastqc_args){
-	  system ("$path_to_fastqc $fastqc_args $val_2");
+	  system ("$path_to_fastqc $fastqc_args $output_dir$val_2");
 	}
 	else{
-	  system ("$path_to_fastqc $val_2");
+	  system ("$path_to_fastqc $output_dir$val_2");
 	}
 	
       }
 
-      if ($gzip or $filename =~ /\.gz$/){
-		
-	# compressing main fastQ output files
-	warn "Compressing the validated output file $val_1 ...\n";
-	system ("gzip -f $val_1");
-	
-	warn "Compressing the validated output file $val_2 ...\n";
-	system ("gzip -f $val_2");
-
-
-	if ($retain){ # compressing unpaired reads
-	  warn "Compressing the unpaired read output $un_1 ...\n";
-	  system ("gzip -f $un_1");
-	
-	  warn "Compressing the unpaired read output $un_2 ...\n";
-	  system ("gzip -f $un_2");
-	}
-      }
-
       warn "Deleting both intermediate output files $file_1 and $file_2\n";
       unlink "$output_dir$file_1";
       unlink "$output_dir$file_2";
@@ -641,7 +660,7 @@
   my $file_1 = shift;
   my $file_2 = shift;
 
-  warn "file_1 $file_1 file_2 $file_2\n\n";
+  warn "file_1: $file_1, file_2: $file_2\n\n";
 
   if ($file_1 =~ /\.gz$/){
     open (IN1,"zcat $output_dir$file_1 |") or die "Couldn't read from file $file_1: $!\n";
@@ -677,8 +696,32 @@
     $out_2 =~ s/trimmed\.fq$/val_2.fq/;
   }
 
-  open (R1,'>',$output_dir.$out_1) or die "Couldn't write to $out_1 $!\n";
-  open (R2,'>',$output_dir.$out_2) or die "Couldn't write to $out_2 $!\n";
+  if ($gzip){
+    if ($dont_gzip){
+      open (R1,'>',$output_dir.$out_1) or die "Couldn't write to $out_1 $!\n";
+    }
+    else{
+      $out_1 .= '.gz';
+      open (R1,"| gzip -c - > ${output_dir}${out_1}") or die "Can't write to $out_1: $!\n";
+    }
+  }
+  else{
+    open (R1,'>',$output_dir.$out_1) or die "Couldn't write to $out_1 $!\n";
+  }
+
+  if ($gzip){
+    if ($dont_gzip){
+      open (R2,'>',$output_dir.$out_2) or die "Couldn't write to $out_2 $!\n";
+    }
+    else{
+      $out_2 .= '.gz';
+      open (R2,"| gzip -c - > ${output_dir}${out_2}") or die "Can't write to $out_2: $!\n";
+    }
+  }
+  else{
+    open (R2,'>',$output_dir.$out_2) or die "Couldn't write to $out_2 $!\n";
+  }
+
   warn "Writing validated paired-end read 1 reads to $out_1\n";
   warn "Writing validated paired-end read 2 reads to $out_2\n\n";
 
@@ -704,8 +747,31 @@
       $unpaired_2 =~ s/trimmed\.fq$/unpaired_2.fq/;
     }
 
-    open (UNPAIRED1,'>',$output_dir.$unpaired_1) or die "Couldn't write to $unpaired_1: $!\n";
-    open (UNPAIRED2,'>',$output_dir.$unpaired_2) or die "Couldn't write to $unpaired_2: $!\n";
+    if ($gzip){
+      if ($dont_gzip){
+	open (UNPAIRED1,'>',$output_dir.$unpaired_1) or die "Couldn't write to $unpaired_1: $!\n";
+      }
+      else{
+	$unpaired_1 .= '.gz';
+	open (UNPAIRED1,"| gzip -c - > ${output_dir}${unpaired_1}") or die "Can't write to $unpaired_1: $!\n";
+      }
+    }
+    else{
+      open (UNPAIRED1,'>',$output_dir.$unpaired_1) or die "Couldn't write to $unpaired_1: $!\n";
+    }
+
+    if ($gzip){
+      if ($dont_gzip){
+	open (UNPAIRED2,'>',$output_dir.$unpaired_2) or die "Couldn't write to $unpaired_2: $!\n";
+      }
+      else{
+	$unpaired_2 .= '.gz';
+	open (UNPAIRED2,"| gzip -c - > ${output_dir}${unpaired_2}") or die "Can't write to $unpaired_2: $!\n";
+      }
+    }
+    else{
+      open (UNPAIRED2,'>',$output_dir.$unpaired_2) or die "Couldn't write to $unpaired_2: $!\n";
+    }
 
     warn "Writing unpaired read 1 reads to $unpaired_1\n";
     warn "Writing unpaired read 2 reads to $unpaired_2\n\n";
@@ -733,7 +799,7 @@
     ++$count;
 
 
-    ## small check if the sequence files appear to FastQ files
+    ## small check if the sequence files appear to be FastQ files
     if ($count == 1){ # performed just once
       if ($id_1 !~ /^\@/ or $l3_1 !~ /^\+/){
 	die "Input file doesn't seem to be in FastQ format at sequence $count\n";
@@ -746,6 +812,14 @@
     chomp $seq_1;
     chomp $seq_2;
 
+    if ($clip_r1){
+      $seq_1 = substr($seq_1,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence
+      $qual_1 = substr($qual_1,$clip_r1);
+    }
+    if ($clip_r2){
+      $seq_2 = substr($seq_2,$clip_r2); # starting after the sequences to be trimmed until the end of the sequence
+      $qual_2 = substr($qual_2,$clip_r2);
+    }
 
     ### making sure that the reads do have a sensible length
     if ( (length($seq_1) < $length_cutoff) or (length($seq_2) < $length_cutoff) ){
@@ -795,6 +869,14 @@
     warn "Number of unpaired read 2 reads printed: $read_2_printed\n";
   }
 
+  close R1 or die $!;
+  close R2 or die $!;
+
+  if ($retain){
+    close UNPAIRED1 or die $!;
+    close UNPAIRED2 or die $!;
+  }
+
   warn "\n";
   if ($retain){
     return ($out_1,$out_2,$unpaired_1,$unpaired_2);
@@ -830,6 +912,9 @@
   my $output_dir;
   my $no_report_file;
   my $suppress_warn;
+  my $dont_gzip;
+  my $clip_r1;
+  my $clip_r2;
 
   my $command_line = GetOptions ('help|man' => \$help,
 				 'q|quality=i' => \$quality,
@@ -856,8 +941,12 @@
 				 'o|output_dir=s' => \$output_dir,
 				 'no_report_file' => \$no_report_file,
 				 'suppress_warn' => \$suppress_warn,
+				 'dont_gzip' => \$dont_gzip,
+				 'clip_R1=i' => \$clip_r1,
+				 'clip_R2=i' => \$clip_r2,
 				);
-  
+
+
   ### EXIT ON ERROR if there were errors with any of the supplied options
   unless ($command_line){
     die "Please respecify command line options\n";
@@ -879,7 +968,7 @@
                                (powered by Cutadapt)
                                   version $trimmer_version
 
-                             Last update: 18 10 2012
+                             Last update: 10 04 2013
 
 VERSION
     exit;
@@ -1011,7 +1100,25 @@
     $output_dir = '';
   }
 
-  return ($quality,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$adapter2,$error_rate,$output_dir,$no_report_file);
+  ### Trimming at the 5' end
+  if (defined $clip_r2){ # trimming 5' bases of read 1
+    die "Clipping the 5' end of read 2 is only allowed for paired-end files (--paired)\n" unless ($validate);
+  }
+
+  if (defined $clip_r1){ # trimming 5' bases of read 1
+    unless ($clip_r1 > 0 and $clip_r1 < 100){
+      die "The 5' clipping value for read 1 should have a sensible value (> 0 and < read length)\n\n";
+    }
+  }
+
+  if (defined $clip_r2){ # trimming 5' bases of read 2
+    unless ($clip_r2 > 0 and $clip_r2 < 100){
+      die "The 5' clipping value for read 2 should have a sensible value (> 0 and < read length)\n\n";
+    }
+  }
+
+
+  return ($quality,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$adapter2,$error_rate,$output_dir,$no_report_file,$dont_gzip,$clip_r1,$clip_r2);
 }
 
 
@@ -1065,8 +1172,11 @@
 -e <ERROR RATE>         Maximum allowed error rate (no. of errors divided by the length of the matching
                         region) (default: 0.1)
 
---gzip                  Compress the output file with gzip. If the input files are gzip-compressed
-                        the output files will be automatically gzip compressed as well.
+--gzip                  Compress the output file with GZIP. If the input files are GZIP-compressed
+                        the output files will automatically be GZIP compressed as well. As of v0.2.8 the
+                        compression will take place on the fly.
+
+--dont_gzip             Output files won't be compressed with GZIP. This option overrides --gzip.
 
 --length <INT>          Discard reads that became shorter than length INT because of either
                         quality or adapter trimming. A value of '0' effectively disables
@@ -1084,6 +1194,17 @@
 
 --suppress_warn         If specified any output to STDOUT or STDERR will be suppressed.
 
+--clip_R1 <int>         Instructs Trim Galore to remove <int> bp from the 5' end of read 1 (or single-end
+                        reads). This may be useful if the qualities were very poor, or if there is some
+                        sort of unwanted bias at the 5' end. Default: OFF.
+
+--clip_R2 <int>         Instructs Trim Galore to remove <int> bp from the 5' end of read 2 (paired-end reads
+                        only). This may be useful if the qualities were very poor, or if there is some sort
+                        of unwanted bias at the 5' end. For paired-end BS-Seq, it is recommended to remove
+                        the first few bp because the end-repair reaction may introduce a bias towards low
+                        methylation. Please refer to the M-bias plot section in the Bismark User Guide for
+                        some examples. Default: OFF.
+
 
 
 RRBS-specific options (MspI digested material):
@@ -1152,7 +1273,7 @@
                         Default: 35 bp.
 
 
-Last modified on 18 Oct 2012.
+Last modified on 15 July 2013.
 
 HELP
   exit;