trim_galore: trim_galore comparison

comparison trim_galore @ 1:898db63d2e84 draft

upgrade to new version

author	bgruening
date	Wed, 17 Jul 2013 15:05:43 -0400
parents	3c1664caa8e3
children	2c1f0fe810f7

comparison

equal deleted inserted replaced

-:3c1664caa8e3
+:898db63d2e84
 use IPC::Open3;
 use File::Spec;
 use File::Basename;
 use Cwd;
-## This program is Copyright (C) 2012, Felix Krueger (felix.krueger@babraham.ac.uk)
+## This program is Copyright (C) 2012-13, Felix Krueger (felix.krueger@babraham.ac.uk)
 ## This program is free software: you can redistribute it and/or modify
 ## it under the terms of the GNU General Public License as published by
 ## the Free Software Foundation, either version 3 of the License, or
 ## (at your option) any later version.
 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
 ## this script takes in FastQ sequences and trims them with Cutadapt
-## last modified on 18 10 2012
+## last modified on 10 April 2013
 ########################################################################
 # change these paths if needed
 my $path_to_fastqc = 'fastqc';
 ########################################################################
-my $trimmer_version = '0.2.5';
+my $trimmer_version = '0.2.8';
 my $DOWARN = 1; # print on screen warning and text by default
 BEGIN { $SIG{'__WARN__'} = sub { warn $_[0] if $DOWARN } };
-my ($cutoff,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$a2,$error_rate,$output_dir,$no_report_file) = process_commandline();
+my ($cutoff,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$a2,$error_rate,$output_dir,$no_report_file,$dont_gzip,$clip_r1,$clip_r2) = process_commandline();
+my @filenames = @ARGV;
+die "\nPlease provide the filename(s) of one or more FastQ file(s) to launch Trim Galore!\n
+USAGE:  'trim_galore [options] <filename(s)>'    or    'trim_galore --help'    for more options\n\n" unless (@filenames);
 ### SETTING DEFAULTS UNLESS THEY WERE SPECIFIED
 unless (defined $cutoff){
 $cutoff = 20;
 }
 if ($phred_encoding == 64){
 $cutoff += 31;
 }
-my @filenames = @ARGV;
 my $file_1;
 my $file_2;
 foreach my $filename (@ARGV){
 trim ($filename);
 	warn "Length cut-off for read 1: $length_read_1 bp\n";
 	print REPORT "Length cut-off for read 1: $length_read_1 bp\n";
 }
 if ($length_read_2 == 35){
-	warn "Length cut-off for read 2: $length_read_2 b (default)\n";
+	warn "Length cut-off for read 2: $length_read_2 bb (default)\n";
 	print REPORT "Length cut-off for read 2: $length_read_2 bp (default)\n";
 }
 else{
 	warn "Length cut-off for read 2: $length_read_2 bp\n";
 	print REPORT "Length cut-off for read 2: $length_read_2 bp\n";
 if ($trim){
 warn "All sequences will be trimmed by 1 bp on their 3' end to avoid problems with invalid paired-end alignments with Bowtie 1\n";
 print REPORT "All sequences will be trimmed by 1 bp on their 3' end to avoid problems with invalid paired-end alignments with Bowtie 1\n";
 }
+if ($clip_r1){
+warn "All Read 1 sequences will be trimmed by $clip_r1 bp from their 5' end to avoid poor qualities or biases\n";
+print REPORT "All Read 1 sequences will be trimmed by $clip_r1 bp from their 5' end to avoid poor qualities or biases\n";
+}
+if ($clip_r2){
+warn "All Read 2 sequences will be trimmed by $clip_r2 bp from their 5' end to avoid poor qualities or biases (e.g. M-bias for BS-Seq applications)\n";
+print REPORT "All Read 2 sequences will be trimmed by $clip_r2 bp from their 5' end to avoid poor qualities or biases (e.g. M-bias for BS-Seq applications)\n";
+}
 if ($fastqc){
 warn "Running FastQC on the data once trimming has completed\n";
 print REPORT "Running FastQC on the data once trimming has completed\n";
 if ($fastqc_args){
 if ($keep and $rrbs){
 warn "Keeping quality trimmed (but not yet adapter trimmed) intermediate FastQ file\n";
 print REPORT "Keeping quality trimmed (but not yet adapter trimmed) intermediate FastQ file\n";
 }
 if ($gzip or $filename =~ /\.gz$/){
-warn "Output file will be GZIP compressed\n";
+$gzip = 1;
-print REPORT "Output file will be GZIP compressed\n";
+unless ($dont_gzip){
+warn "Output file(s) will be GZIP compressed\n";
+print REPORT "Output file will be GZIP compressed\n";
+}
 }
 warn "\n";
 print REPORT "\n";
 sleep (3);
 }
 else{
 $output_filename =~ s/$/_trimmed.fq/;
 }
+if ($gzip or $filename =~ /\.gz$/){
+unless ($dont_gzip){
+if ($validate){
+	open (OUT,'>',$output_dir.$output_filename) or die "Can't open $output_filename: $!\n"; # don't need to gzip intermediate file
+}
+else{
+	$output_filename .= '.gz';
+	open (OUT,"| gzip -c - > ${output_dir}${output_filename}") or die "Can't write to $output_filename: $!\n";
+}
+}
+else{
+open (OUT,'>',$output_dir.$output_filename) or die "Can't open $output_filename: $!\n"; # --dont_gzip was specified, so the output is written uncompressed
+}
+}
+else{
+open (OUT,'>',$output_dir.$output_filename) or die "Can't open $output_filename: $!\n";
+}
 warn "Writing final adapter and quality trimmed output to $output_filename\n\n";
-open (OUT,'>',$output_dir.$output_filename) or die "Can't open $output_filename: $!\n";
-sleep (2);
 my $count = 0;
 my $too_short = 0;
 my $quality_trimmed = 0;
 my $rrbs_trimmed = 0;
 ### PRINTING (POTENTIALLY TRIMMED) SEQUENCE
 if ($validate){ # printing the sequence without performing a length check (this is performed for the read pair separately later)
 	print OUT "$l1$seq\n$l3$qual\n";
 }
 else{ # single end
+	if ($clip_r1){
+	  $seq = substr($seq,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence
+	  $qual = substr($qual,$clip_r1);
+	}
 	if (length $seq < $length_cutoff){
 	  ++$too_short;
 	  next;
 	}
 	else{
 ### PRINTING (POTENTIALLY TRIMMED) SEQUENCE
 if ($validate){ # printing the sequence without performing a length check (this is performed for the read pair separately later)
 	print OUT "$l1$seq\n$l3$qual\n";
 }
 else{ # single end
+	if ($clip_r1){
+	  $seq = substr($seq,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence
+	  $qual = substr($qual,$clip_r1);
+	}
 	if (length $seq < $length_cutoff){
 	  ++$too_short;
 	  next;
 	}
 	else{
 }
 warn "\n";
 print REPORT "\n";
-### RUNNING FASTQC
+### RUNNING FASTQC unless we are dealing with paired-end files
-if ($fastqc){
+unless($validate){
-warn "\n  >>> Now running FastQC on the data <<<\n\n";
+if ($fastqc){
-sleep (5);
+warn "\n  >>> Now running FastQC on the data <<<\n\n";
-if ($fastqc_args){
+sleep (5);
-system ("$path_to_fastqc $fastqc_args $output_filename");
+if ($fastqc_args){
-}
+	system ("$path_to_fastqc $fastqc_args $output_dir$output_filename");
-else{
+}
-system ("$path_to_fastqc $output_filename");
+else{
-}
+	system ("$path_to_fastqc $output_dir$output_filename");
 }
-### COMPRESSING OUTPUT FILE
-unless ($validate){ # not gzipping intermediate files for paired-end files
-if ($gzip or $filename =~ /\.gz$/){
-warn "\n  >>> GZIP-ing the output file <<<\n\n";
-system ("gzip -f $output_filename");
-$output_filename = $output_filename.'.gz';
 }
 }
 ### VALIDATE PAIRED-END FILES
 if ($validate){
 	warn "\n  >>> Now running FastQC on the validated data $val_1<<<\n\n";
 	sleep (3);
 	if ($fastqc_args){
-	  system ("$path_to_fastqc $fastqc_args $val_1");
+	  system ("$path_to_fastqc $fastqc_args $output_dir$val_1");
 	}
 	else{
-	  system ("$path_to_fastqc $val_1");
+	  system ("$path_to_fastqc $output_dir$val_1");
 	}
 	warn "\n  >>> Now running FastQC on the validated data $val_2<<<\n\n";
 	sleep (3);
 	if ($fastqc_args){
-	  system ("$path_to_fastqc $fastqc_args $val_2");
+	  system ("$path_to_fastqc $fastqc_args $output_dir$val_2");
 	}
 	else{
-	  system ("$path_to_fastqc $val_2");
+	  system ("$path_to_fastqc $output_dir$val_2");
 	}
-}
-if ($gzip or $filename =~ /\.gz$/){
-	# compressing main fastQ output files
-	warn "Compressing the validated output file $val_1 ...\n";
-	system ("gzip -f $val_1");
-	warn "Compressing the validated output file $val_2 ...\n";
-	system ("gzip -f $val_2");
-	if ($retain){ # compressing unpaired reads
-	  warn "Compressing the unpaired read output $un_1 ...\n";
-	  system ("gzip -f $un_1");
-	  warn "Compressing the unpaired read output $un_2 ...\n";
-	  system ("gzip -f $un_2");
-	}
 }
 warn "Deleting both intermediate output files $file_1 and $file_2\n";
 unlink "$output_dir$file_1";
 unlink "$output_dir$file_2";
 sub validate_paired_end_files{
 my $file_1 = shift;
 my $file_2 = shift;
-warn "file_1 $file_1 file_2 $file_2\n\n";
+warn "file_1: $file_1, file_2: $file_2\n\n";
 if ($file_1 =~ /\.gz$/){
 open (IN1,"zcat $output_dir$file_1 |") or die "Couldn't read from file $file_1: $!\n";
 }
 else{
 }
 else{
 $out_2 =~ s/trimmed\.fq$/val_2.fq/;
 }
-open (R1,'>',$output_dir.$out_1) or die "Couldn't write to $out_1 $!\n";
+if ($gzip){
-open (R2,'>',$output_dir.$out_2) or die "Couldn't write to $out_2 $!\n";
+if ($dont_gzip){
+open (R1,'>',$output_dir.$out_1) or die "Couldn't write to $out_1 $!\n";
+}
+else{
+$out_1 .= '.gz';
+open (R1,"| gzip -c - > ${output_dir}${out_1}") or die "Can't write to $out_1: $!\n";
+}
+}
+else{
+open (R1,'>',$output_dir.$out_1) or die "Couldn't write to $out_1 $!\n";
+}
+if ($gzip){
+if ($dont_gzip){
+open (R2,'>',$output_dir.$out_2) or die "Couldn't write to $out_2 $!\n";
+}
+else{
+$out_2 .= '.gz';
+open (R2,"| gzip -c - > ${output_dir}${out_2}") or die "Can't write to $out_2: $!\n";
+}
+}
+else{
+open (R2,'>',$output_dir.$out_2) or die "Couldn't write to $out_2 $!\n";
+}
 warn "Writing validated paired-end read 1 reads to $out_1\n";
 warn "Writing validated paired-end read 2 reads to $out_2\n\n";
 my $unpaired_1;
 my $unpaired_2;
 }
 else{
 $unpaired_2 =~ s/trimmed\.fq$/unpaired_2.fq/;
 }
-open (UNPAIRED1,'>',$output_dir.$unpaired_1) or die "Couldn't write to $unpaired_1: $!\n";
+if ($gzip){
-open (UNPAIRED2,'>',$output_dir.$unpaired_2) or die "Couldn't write to $unpaired_2: $!\n";
+if ($dont_gzip){
+	open (UNPAIRED1,'>',$output_dir.$unpaired_1) or die "Couldn't write to $unpaired_1: $!\n";
+}
+else{
+	$unpaired_1 .= '.gz';
+	open (UNPAIRED1,"| gzip -c - > ${output_dir}${unpaired_1}") or die "Can't write to $unpaired_1: $!\n";
+}
+}
+else{
+open (UNPAIRED1,'>',$output_dir.$unpaired_1) or die "Couldn't write to $unpaired_1: $!\n";
+}
+if ($gzip){
+if ($dont_gzip){
+	open (UNPAIRED2,'>',$output_dir.$unpaired_2) or die "Couldn't write to $unpaired_2: $!\n";
+}
+else{
+	$unpaired_2 .= '.gz';
+	open (UNPAIRED2,"| gzip -c - > ${output_dir}${unpaired_2}") or die "Can't write to $unpaired_2: $!\n";
+}
+}
+else{
+open (UNPAIRED2,'>',$output_dir.$unpaired_2) or die "Couldn't write to $unpaired_2: $!\n";
+}
 warn "Writing unpaired read 1 reads to $unpaired_1\n";
 warn "Writing unpaired read 2 reads to $unpaired_2\n\n";
 }
 last unless ($id_2 and $seq_2 and $l3_2 and $qual_2);
 ++$count;
-## small check if the sequence files appear to FastQ files
+## small check if the sequence files appear to be FastQ files
 if ($count == 1){ # performed just once
 if ($id_1 !~ /^\@/ or $l3_1 !~ /^\+/){
 	die "Input file doesn't seem to be in FastQ format at sequence $count\n";
 }
 if ($id_2 !~ /^\@/ or $l3_2 !~ /^\+/){
 }
 chomp $seq_1;
 chomp $seq_2;
+if ($clip_r1){
+$seq_1 = substr($seq_1,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence
+$qual_1 = substr($qual_1,$clip_r1);
+}
+if ($clip_r2){
+$seq_2 = substr($seq_2,$clip_r2); # starting after the sequences to be trimmed until the end of the sequence
+$qual_2 = substr($qual_2,$clip_r2);
+}
 ### making sure that the reads do have a sensible length
 if ( (length($seq_1) < $length_cutoff) or (length($seq_2) < $length_cutoff) ){
 ++$sequence_pairs_removed;
 if ($retain){ # writing out single-end reads if they are longer than the cutoff
 print REPORT "Number of sequence pairs removed because at least one read was shorter than the length cutoff ($length_cutoff bp): $sequence_pairs_removed ($percentage%)\n";
 if ($keep){
 warn "Number of unpaired read 1 reads printed: $read_1_printed\n";
 warn "Number of unpaired read 2 reads printed: $read_2_printed\n";
+}
+close R1 or die $!;
+close R2 or die $!;
+if ($retain){
+close UNPAIRED1 or die $!;
+close UNPAIRED2 or die $!;
 }
 warn "\n";
 if ($retain){
 return ($out_1,$out_2,$unpaired_1,$unpaired_2);
 my $length_read_2;
 my $error_rate;
 my $output_dir;
 my $no_report_file;
 my $suppress_warn;
+my $dont_gzip;
+my $clip_r1;
+my $clip_r2;
 my $command_line = GetOptions ('help|man' => \$help,
 				 'q|quality=i' => \$quality,
 				 'a|adapter=s' => \$adapter,
 				 'a2|adapter2=s' => \$adapter2,
 				 'length_2|r2=i' => \$length_read_2,
 				 'e|error_rate=s' => \$error_rate,
 				 'o|output_dir=s' => \$output_dir,
 				 'no_report_file' => \$no_report_file,
 				 'suppress_warn' => \$suppress_warn,
+				 'dont_gzip' => \$dont_gzip,
+				 'clip_R1=i' => \$clip_r1,
+				 'clip_R2=i' => \$clip_r2,
 				);
 ### EXIT ON ERROR if there were errors with any of the supplied options
 unless ($command_line){
 die "Please respecify command line options\n";
 }
 Quality-/Adapter-/RRBS-Trimming
 (powered by Cutadapt)
 version $trimmer_version
-Last update: 18 10 2012
+Last update: 10 04 2013
 VERSION
 exit;
 }
 }
 else{
 $output_dir = '';
 }
-return ($quality,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$adapter2,$error_rate,$output_dir,$no_report_file);
+### Trimming at the 5' end
+if (defined $clip_r2){ # trimming 5' bases of read 2 (paired-end only)
+die "Clipping the 5' end of read 2 is only allowed for paired-end files (--paired)\n" unless ($validate);
+}
+if (defined $clip_r1){ # trimming 5' bases of read 1
+unless ($clip_r1 > 0 and $clip_r1 < 100){
+die "The 5' clipping value for read 1 should have a sensible value (> 0 and < read length)\n\n";
+}
+}
+if (defined $clip_r2){ # trimming 5' bases of read 2
+unless ($clip_r2 > 0 and $clip_r2 < 100){
+die "The 5' clipping value for read 2 should have a sensible value (> 0 and < read length)\n\n";
+}
+}
+return ($quality,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$adapter2,$error_rate,$output_dir,$no_report_file,$dont_gzip,$clip_r1,$clip_r2);
 }
 will be trimmed of the 3' end of any read.
 -e <ERROR RATE>         Maximum allowed error rate (no. of errors divided by the length of the matching
 region) (default: 0.1)
---gzip                  Compress the output file with gzip. If the input files are gzip-compressed
+--gzip                  Compress the output file with GZIP. If the input files are GZIP-compressed
-the output files will be automatically gzip compressed as well.
+the output files will automatically be GZIP compressed as well. As of v0.2.8 the
+compression will take place on the fly.
+--dont_gzip             Output files won't be compressed with GZIP. This option overrides --gzip.
 --length <INT>          Discard reads that became shorter than length INT because of either
 quality or adapter trimming. A value of '0' effectively disables
 this behaviour. Default: 20 bp.
 directory.
 --no_report_file        If specified no report file will be generated.
 --suppress_warn         If specified any output to STDOUT or STDERR will be suppressed.
+--clip_R1 <int>         Instructs Trim Galore to remove <int> bp from the 5' end of read 1 (or single-end
+reads). This may be useful if the qualities were very poor, or if there is some
+sort of unwanted bias at the 5' end. Default: OFF.
+--clip_R2 <int>         Instructs Trim Galore to remove <int> bp from the 5' end of read 2 (paired-end reads
+only). This may be useful if the qualities were very poor, or if there is some sort
+of unwanted bias at the 5' end. For paired-end BS-Seq, it is recommended to remove
+the first few bp because the end-repair reaction may introduce a bias towards low
+methylation. Please refer to the M-bias plot section in the Bismark User Guide for
+some examples. Default: OFF.
 RRBS-specific options (MspI digested material):
 -r2/--length_2 <INT>    Unpaired single-end read length cutoff needed for read 2 to be written to
 '.unpaired_2.fq' output file. These reads may be mapped in single-end mode.
 Default: 35 bp.
-Last modified on 18 Oct 2012.
+Last modified on 15 July 2013.
 HELP
 exit;
 }

Mercurial > repos > bgruening > trim_galore

comparison trim_galore @ 1:898db63d2e84 draft