Mercurial > repos > plus > archer
view archer.pl @ 3:3af9b7634b2d draft default tip
Uploaded
author | plus |
---|---|
date | Thu, 29 May 2014 02:32:55 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl
# archer.pl - pipeline driver.
#
# Reads the config file named by -config, writes a log next to it
# (<config_file>.log), builds the target regions file, and collects the
# FASTQ files of every sample that is present and non-empty so they can be
# aligned in a single bwa_enz run.
use strict;
use warnings;

# File-scoped counters.  $i must stay file-scoped: define_tags() reads it
# directly to select the current sample, so it cannot become a loop-local.
my($i, $j, $k);
my $parameters = {};

# Print the command-line help and terminate with a non-zero exit status.
sub usage {
    print "\nUsage: $0 \n\n\t ";
    print "REQUIRED \n\t ";
    print "-config <config_file> \n\n";
    exit(1);
}

if ( scalar(@ARGV) == 0 ) {
    usage();
}

# Parse the Command Line
parse_command_line($parameters, @ARGV);

# -config is required.  Without this check a missing value would silently
# produce a log file called ".log" and fail much later.
if ( !defined $parameters->{config_file} || $parameters->{config_file} eq '' ) {
    usage();
}

# Log File
# LOG_FILE stays a bareword handle because the rest of the script prints to
# it by name and passes \*LOG_FILE to the helper subs.
my $log_file = $parameters->{config_file} . ".log";
unless ( open(LOG_FILE, '>', $log_file) ) {
    print STDERR "Cannot open file \"$log_file\" to write to!! ($!)\n\n";
    exit(1);    # signal the failure to the caller instead of exiting 0
}
print LOG_FILE "config = $parameters->{config_file}\n";

# Time Stamp
my $timestamp = localtime(time);
print LOG_FILE $timestamp, "\n";

# Parse Config File
my @samples = ();
my $number_of_samples = parse_config_file(\@samples, \$parameters);

# Every output path is built from "directory"; refuse to continue without it
# rather than writing files relative to "/".
unless ( defined $parameters->{directory} ) {
    my $message = "No \"directory\" entry found in $parameters->{config_file}\n";
    print LOG_FILE $message;
    print STDERR $message;
    close(LOG_FILE);
    exit(1);
}
print LOG_FILE "directory = $parameters->{directory}\n\n";

my $files = {};

# Create Target Regions File
$files->{target_regions_file} = $parameters->{directory} . "/target_regions.dat";
create_target_regions_file(\$parameters, \$files, \*LOG_FILE);

my @tags = ();
my($reads, $tag);
my($fastq_file_1, $fastq_file_2);
my %proceed = ();                      # sample index -> 1 if its FASTQ input is usable
my $alignment_string = '';             # space-separated FASTQ files handed to bwa_enz
my $number_of_alignment_files = 0;
my $cmd = '';

# First pass over the samples: decide which ones can be processed and
# accumulate their FASTQ files for the alignment step.
for ($i = 0; $i < $number_of_samples; $i++) {
    # Define Tags
    define_tags(\@samples, \@tags, \$reads, \$tag, \$fastq_file_1, \$fastq_file_2, \$parameters, \*LOG_FILE);

    # Check if FASTQ Files Exist and are Non-empty
    $proceed{$i} = decide_to_proceed(\$reads, \$fastq_file_1, \$fastq_file_2);

    # Create String of FASTQ Files to be Aligned by bwa_enz
    create_alignment_string($proceed{$i}, \$reads, \$number_of_alignment_files, \$alignment_string, \$fastq_file_1, \$fastq_file_2);
}

# Align the reads with bwa_enz
# Would be better to align paired reads together??
log_and_run_command( align_reads(\$parameters, $number_of_alignment_files, $alignment_string) );

# Second pass over the samples: run every pipeline stage for the samples
# flagged in %proceed, and write a stub summary for the others.
for ($i = 0; $i < $number_of_samples; $i++) {
    # Define Tags
    # NOTE(review): define_tags() rebuilds the FASTQ names as "<tag>.fastq",
    # dropping any ".gz" suffix decide_to_proceed() appended in the first
    # pass - confirm the downstream scripts cope with gzipped inputs.
    define_tags(\@samples, \@tags, \$reads, \$tag, \$fastq_file_1, \$fastq_file_2, \$parameters, \*LOG_FILE);

    # Define Summary File Names
    define_summary_file_names(\$tag, \$parameters, \$files);

    if ( $proceed{$i} == 1 ) {
        # Convert SAM -> BAM -> BED
        define_alignments_file_names(\$reads, \@tags, \$tag, \$parameters, \$files);
        log_and_run_command(
              rename_sam_files(\$reads, \$files)
            . alignments(\$reads, \$fastq_file_1, \$fastq_file_2, \$parameters, \$files)
        );

        # De-duplicate the SAM File(s)
        define_de_duplication_file_names(\$reads, \@tags, \$tag, \$parameters, \$files);
        log_and_run_command( de_duplication(\$reads, \$fastq_file_1, \$fastq_file_2, \$parameters, \$files) );
        # What if only reverse read exists? Don't want to de-duplicate?

        # Select On-/Off-Target Reads
        # Split marked files into two files
        define_on_and_off_target_file_names(\$reads, \@tags, \$tag, \$parameters, \$files);
        log_and_run_command( select_on_and_off_target_reads(\$reads, \$parameters, \$files) );

        # Coverage and Start Sites
        define_coverage_and_start_site_file_names(\$reads, \@tags, \$parameters, \$files);
        log_and_run_command( generate_coverage_and_start_sites(\$reads, \$parameters, \$files) );

        # Proceed through the rest of the pipeline using the on-target reads
        # Create Master Files - one line per read
        define_master_files_file_names(\$reads, \@tags, \$parameters, \$files);
        log_and_run_command( generate_master_files(\$reads, \$parameters, \$files) );

        # Select Fusion Reads - do not count mapping to a 'novel', i.e., not in refseq, region as a fusion
        define_fusion_reads_file_names(\$reads, \@tags, \$tag, \$parameters, \$files);
        log_and_run_command( select_fusion_reads(\$reads, \$parameters, \$files) );

        # Count Fusions
        define_count_fusions_file_names(\$tag, \$parameters, \$files);
        log_and_run_command( count_fusions(\$reads, \$parameters, \$files) );

        # Flanking Sequences
        define_flanking_sequences_file_names(\$tag, \$parameters, \$files);
        log_and_run_command( flanking_sequences(\$reads, \$fastq_file_1, \$fastq_file_2, \$parameters, \$files) );

        # BAM Dedup Files
        define_bam_dedup_files_file_names(\$reads, \@tags, \$parameters, \$files);
        log_and_run_command( bam_dedup_files(\$reads, \$files) );

        # Consensus Sequences - Fusion and Splice
        #&define_consensus_sequences_file_names(\$tag, \$parameters, \$files);
        #$cmd = &consensus_sequences(\$reads, \$fastq_file_1, \$fastq_file_2, \$tag, \$parameters, \$files);
        #print LOG_FILE $cmd;
        #system($cmd);

        # Sort SAM Files
        define_sort_sam_files_file_names(\$reads, \@tags, \$parameters, \$files);
        log_and_run_command( sort_sam_files(\$reads, \$files) );

        # On-target Stats
        define_on_target_stats_file_names(\$tag, \$parameters, \$files);
        log_and_run_command( on_target_stats(\$reads, \$parameters, \$files) );

        # Total Molecule Counts
        define_total_molecule_counts_file_names(\$reads, \@tags, \$tag, \$parameters, \$files);
        log_and_run_command( total_molecule_counts(\$reads, \$parameters, \$files) );

        # De-duplicated Molecule Counts
        define_de_deduplicated_molecule_counts_file_names(\$reads, \@tags, \$tag, \$parameters, \$files);
        log_and_run_command( de_duplicated_molecule_counts(\$reads, \$parameters, \$files) );

        # All Molecule Counts
        define_all_molecule_counts_file_names(\$tag, \$parameters, \$files);
        log_and_run_command( all_molecule_counts(\$parameters, \$files) );

        # QC Check
        define_qc_check_file_names(\$tag, \$parameters, \$files);
        log_and_run_command( qc_check(\$parameters, \$files) );

        # Coverage Uniformity
        define_coverage_uniformity_file_names(\$tag, \$parameters, \$files);
        log_and_run_command( coverage_uniformity(\$reads, \$parameters, \$files) );

        # Summary
        log_and_run_command( summary(\$tag, \$parameters, \$files) );

        # Clean Up
        log_and_run_command( clean_up(\$reads, \$parameters, \$files) );
    }
    else {
        # Summary for Unprocessed Sample
        log_and_run_command( summary_for_unprocessed_sample(\$reads, \$fastq_file_1, \$fastq_file_2, \$parameters, \$files) );
    }

    # Time Stamp
    $timestamp = localtime(time);
    print LOG_FILE $timestamp, "\n";
}

# Join Multiple Samples
#$cmd = &join_multiple_samples(\$parameters);
#print LOG_FILE $cmd;
#system($cmd);

close(LOG_FILE);
exit;

# Write a block of shell commands to LOG_FILE, run it through system(), and
# record a warning in the log when it does not exit cleanly.  Failures are
# logged only; the pipeline keeps going exactly as before.
# Returns the raw system() status.
sub log_and_run_command {
    my ($command) = @_;

    print LOG_FILE $command;
    my $status = system($command);
    if ( $status == -1 ) {
        print LOG_FILE "WARNING: command could not be started: $!\n";
    }
    elsif ( $status != 0 ) {
        print LOG_FILE "WARNING: command exited with status ", ($status >> 8),
            " (signal ", ($status & 127), ")\n";
    }
    return $status;
}

# Scan the argument list for "-config <file>" and store the file name in
# $parameters->{config_file}.  Unrecognised arguments are ignored.
#   $parameters - hash ref that receives config_file
#   @args       - the command-line arguments
sub parse_command_line {
    # @args rather than a lexical @ARGV: shadowing the global was legal
    # but misleading.
    my ($parameters, @args) = @_;

    while ( scalar @args > 0 ) {
        my $next_arg = shift(@args);
        if ( $next_arg eq "-config" ) {
            # undef when "-config" is the last argument
            $parameters->{config_file} = shift(@args);
        }
    }
    return;
}

# Read "key=value" lines from the config file named in
# $$parameters_ref->{config_file}.
#   $samples_ref    - array ref; receives the value of every "sample" line,
#                     in file order, starting at index 0
#   $parameters_ref - reference to the parameters hash ref; every other key
#                     is stored there
# Lines of length <= 1 and lines starting with '#' are skipped.
# Dies if the file cannot be opened.  Returns the number of samples.
sub parse_config_file {
    my ($samples_ref, $parameters_ref) = @_;

    my $config_file = $$parameters_ref->{config_file};
    my $count = 0;

    open( my $config_fh, '<', $config_file )
        or die "Can't open $config_file : $!";
    while ( my $line = <$config_fh> ) {
        $line =~ s/\r?\n\z//;    # tolerate CRLF line endings as well as LF
        next if length($line) <= 1 or $line =~ /^\#/;

        # LIMIT 2 keeps any '=' inside the value intact; the old unlimited
        # split kept only the text up to the second '='.
        my ($key, $value) = split(/=/, $line, 2);
        next unless defined $key;

        if ( $key eq 'sample' ) {
            $samples_ref->[$count] = $value;
            $count++;
        }
        else {
            $$parameters_ref->{$key} = $value;
        }
    }
    close($config_fh);

    my $num_samples = scalar @$samples_ref;
    print "number of samples = $num_samples\n";
    return $num_samples;
}

sub create_target_regions_file {
    my $parameters_ref = shift;
    my $files_ref = shift;
    my $file_handle_ref = shift;
    my $cmd = '';
    if ( ($$parameters_ref->{control_regions_file} ne 'NULL') && ($$parameters_ref->{target_regions_file} ne 'NULL') ) {
        $cmd = "cat $$parameters_ref->{control_regions_file} $$parameters_ref->{target_regions_file} > $$files_ref->{target_regions_file}\n";
        print $file_handle_ref $cmd;
        system($cmd);
    }
    elsif ( $$parameters_ref->{target_regions_file} ne 'NULL' ) {
        $$files_ref->{target_regions_file} =
$$parameters_ref->{target_regions_file};
    }
    elsif ( $$parameters_ref->{control_regions_file} ne 'NULL' ) {
        $$files_ref->{target_regions_file} = $$parameters_ref->{control_regions_file};
    }
    else {
        # Create Target Regions File
        my $label;
        my $target_file = $$parameters_ref->{directory} . "/target_file.dat";
        my $control_file = $$parameters_ref->{directory} . "/control_file.dat";
        my $target_temp_outputfile_1 = $$parameters_ref->{directory} . "/target_temp_1.dat";
        my $target_temp_outputfile_2 = $$parameters_ref->{directory} . "/target_temp_2.dat";
        my $control_temp_outputfile_1 = $$parameters_ref->{directory} . "/control_temp_1.dat";
        my $control_temp_outputfile_2 = $$parameters_ref->{directory} . "/control_temp_2.dat";
        my $path_to_annotation_script = $$parameters_ref->{path} . "/archer/annotation/";

        # Target Primers Fasta File
        if ( -e $$parameters_ref->{target_primers} ) {
            if ( -s $$parameters_ref->{target_primers} ) {
                $label = 'fusion';
                $cmd = "$$parameters_ref->{path}/create_target_regions_file.pl -target $$parameters_ref->{target_primers} -label $label -refseq $$parameters_ref->{refseq_file} -gtf_file $$parameters_ref->{gtf_file} -path $path_to_annotation_script -t1 $target_temp_outputfile_1 -t2 $target_temp_outputfile_2 -o $target_file\n";
                print $file_handle_ref $cmd;
                system($cmd);
            }
        }

        # Control Primers Fasta File
        if ( -e $$parameters_ref->{control_primers} ) {
            if ( -s $$parameters_ref->{control_primers} ) {
                $label = 'housekeeping';
                $cmd = "$$parameters_ref->{path}/create_target_regions_file.pl -target $$parameters_ref->{control_primers} -label $label -refseq $$parameters_ref->{refseq_file} -gtf_file $$parameters_ref->{gtf_file} -path $path_to_annotation_script -t1 $control_temp_outputfile_1 -t2 $control_temp_outputfile_2 -o $control_file\n";
                print $file_handle_ref $cmd;
                system($cmd);
            }
        }

        # Combine: control regions first, target regions appended after them
        if ( -e $control_file ) {
            if ( -s $control_file ) {
                $cmd = "cp $control_file $$files_ref->{target_regions_file}\n";
                if ( -e $target_file ) {
                    if ( -s $target_file ) {
                        $cmd .= "cat $target_file >> $$files_ref->{target_regions_file}\n";
                    }
                }
                print $file_handle_ref $cmd;
                system($cmd);
            }
        }
        elsif ( -e $target_file ) {
            if ( -s $target_file ) {
                $cmd = "cp $target_file $$files_ref->{target_regions_file}\n";
                print $file_handle_ref $cmd;
                system($cmd);
            }
        }

        # Remove the temporary files, then the intermediate region files
        $cmd = '';
        if ( -e $target_file ) {
            $cmd .= "rm $target_temp_outputfile_1\n";
            $cmd .= "rm $target_temp_outputfile_2\n";
        }
        if ( -e $control_file ) {
            $cmd .= "rm $control_temp_outputfile_1\n";
            $cmd .= "rm $control_temp_outputfile_2\n";
        }
        print $file_handle_ref $cmd;
        system($cmd);
        $cmd = '';
        if ( -e $target_file ) {
            $cmd .= "rm $target_file\n";
        }
        if ( -e $control_file ) {
            $cmd .= "rm $control_file\n";
        }
        print $file_handle_ref $cmd;
        system($cmd);
        print $file_handle_ref "\n";
    }
}

# Split the current sample entry into its tags and derive the read type,
# the sample tag and the FASTQ file names from them.
#   $samples_array_ref - array ref of sample entries from the config file
#   $tags_array_ref    - array ref; overwritten with the tags of this sample
#   $reads_ref         - scalar ref; set to 'single' (one tag) or 'paired'
#   $tag_ref           - scalar ref; set to "<tag>" or "<tag1>_<tag2>"
#   $fastq_file_1_ref  - scalar ref; set to "<directory>/<tag1>.fastq"
#   $fastq_file_2_ref  - scalar ref; set to "<directory>/<tag2>.fastq" for
#                        paired samples, left untouched for single ones
#   $parameters_ref    - reference to the parameters hash ref ("directory")
#   $file_handle_ref   - log file handle
# The sample index is taken from the file-scoped $i, not from an argument.
# The tag is printed to STDOUT and to the log.
sub define_tags {
    my ($samples_array_ref, $tags_array_ref, $reads_ref, $tag_ref,
        $fastq_file_1_ref, $fastq_file_2_ref, $parameters_ref, $file_handle_ref) = @_;

    # split ' ' ignores leading whitespace, so a value such as " S1 S2" no
    # longer yields an empty first tag the way split(/\s+/) did.
    @$tags_array_ref = split(' ', $samples_array_ref->[$i]);

    my $directory = $$parameters_ref->{directory};

    if ( scalar(@$tags_array_ref) == 1 ) {
        $$reads_ref = 'single';
        # Read from the array we were handed; the old code reached for the
        # file-scoped @tags, which only worked while callers passed \@tags.
        $$tag_ref = $tags_array_ref->[0];
        $$fastq_file_1_ref = $directory . "/" . $tags_array_ref->[0] . ".fastq";
    }
    else {
        $$reads_ref = 'paired';
        $$tag_ref = $tags_array_ref->[0] . "_" . $tags_array_ref->[1];
        $$fastq_file_1_ref = $directory . "/" . $tags_array_ref->[0] . ".fastq";
        $$fastq_file_2_ref = $directory . "/" . $tags_array_ref->[1] . ".fastq";
    }

    print $$tag_ref, "\n";
    print {$file_handle_ref} $$tag_ref, "\n";
    return;
}

# Check one FASTQ file name.  Returns 1 when the named file exists and is
# non-empty.  When the file does not exist, ".gz" is appended to the caller's
# variable and the gzipped name is checked the same way.  An existing but
# empty file returns 0 without trying the ".gz" name.
sub _fastq_is_usable {
    my ($fastq_ref) = @_;

    if ( -e $$fastq_ref ) {
        return 1 if -s $$fastq_ref;
        return 0;
    }

    $$fastq_ref .= ".gz";    # See if the file exists in gzipped form
    return 1 if ( -e $$fastq_ref ) && ( -s $$fastq_ref );
    return 0;
}

# Decide whether a sample can be processed: every FASTQ file it needs must
# exist (plain or gzipped) and be non-empty.
#   $reads_ref        - scalar ref holding 'single' or 'paired'
#   $fastq_file_1_ref - scalar ref to the read 1 file name
#   $fastq_file_2_ref - scalar ref to the read 2 file name (paired only)
# The referenced names gain a ".gz" suffix when the plain file is missing.
# Read 2 is only examined once read 1 has been found usable.
# Returns 1 to proceed, 0 otherwise (including for any other read type).
sub decide_to_proceed {
    my ($reads_ref, $fastq_file_1_ref, $fastq_file_2_ref) = @_;

    my $proceed = 0;
    if ( $$reads_ref eq 'single' ) {
        $proceed = _fastq_is_usable($fastq_file_1_ref);
    }
    elsif ( $$reads_ref eq 'paired' ) {
        if ( _fastq_is_usable($fastq_file_1_ref) && _fastq_is_usable($fastq_file_2_ref) ) {
            $proceed = 1;
        }
    }
    return $proceed;
}

sub create_alignment_string {
    my $proceed_value = shift;
    my $reads_ref = shift;
    my $number_of_alignment_files_ref = shift;
    my $alignment_string_ref = shift;
    my $fastq_file_1_ref = shift;
    my $fastq_file_2_ref = shift;
    if ( $proceed_value == 1 ) {
        if ( $$number_of_alignment_files_ref == 0 ){
            $$alignment_string_ref = $$fastq_file_1_ref;
        }
        else {
            $$alignment_string_ref .= " " . $$fastq_file_1_ref;
        }
        $$number_of_alignment_files_ref++;
        if ( $$reads_ref eq 'paired' ) {
            $$alignment_string_ref .= " " .
$$fastq_file_2_ref;
            $$number_of_alignment_files_ref++;
        }
    }
}

# Build the command block for the alignment step.
#   $parameters_ref                  - reference to the parameters hash ref
#   $number_of_alignment_files_value - how many FASTQ files were collected
#   $alignment_string_value          - those files, space separated
# The bwa_enz line is only emitted when at least one file was collected.
# Returns the newline-terminated command text.
sub align_reads {
    my ($parameters_ref, $number_of_alignment_files_value, $alignment_string_value) = @_;
    my $settings = $$parameters_ref;

    my @commands = ("echo Align Reads");
    if ( $number_of_alignment_files_value > 0 ) {
        push @commands,
            "bwa_enz mem -Q 0 -m -D $settings->{directory} $settings->{reference_file} $alignment_string_value";
    }
    return join("\n", @commands) . "\n";
}

# Build the commands that move the original SAM file(s) to their "full"
# names; read 2 is included for paired samples only.
sub rename_sam_files {
    my ($reads_ref, $files_ref) = @_;
    my $names = $$files_ref;

    my @commands = (
        "echo Rename SAM Files",
        "mv $names->{sam_file_1_orig} $names->{sam_file_1_full}",
    );
    if ( $$reads_ref eq 'paired' ) {
        push @commands, "mv $names->{sam_file_2_orig} $names->{sam_file_2_full}";
    }
    return join("\n", @commands) . "\n";
}

# Build the SAM -> filtered SAM -> BAM -> BED conversion commands and merge
# the per-read BED files into bed_file_combined.
# samtools flags used: -S input is SAM, -b output is BAM, -h print header
# for the SAM output.
# The FASTQ and parameter arguments are accepted but not used here (the
# bwa mem calls that needed them are disabled in the original).
sub alignments {
    my ($reads_ref, $fastq_file_1_ref, $fastq_file_2_ref, $parameters_ref, $files_ref) = @_;
    my $names = $$files_ref;
    my $reads = $$reads_ref;

    my @commands = (
        "echo Alignments",
        "samtools view -Shq 40 $names->{sam_file_1_full} > $names->{sam_file_1}",
        "samtools view -bS $names->{sam_file_1} > $names->{bam_file_1}",
        "bamToBed -i $names->{bam_file_1} > $names->{bed_file_1_orig}",
    );
    if ( $reads eq 'single' ) {
        push @commands, "mv $names->{bed_file_1_orig} $names->{bed_file_combined}";
    }
    elsif ( $reads eq 'paired' ) {
        push @commands,
            "samtools view -Shq 40 $names->{sam_file_2_full} > $names->{sam_file_2}",
            "samtools view -bS $names->{sam_file_2} > $names->{bam_file_2}",
            "bamToBed -i $names->{bam_file_2} > $names->{bed_file_2_orig}",
            "cat $names->{bed_file_1_orig} $names->{bed_file_2_orig} > $names->{bed_file_combined}",
            "rm $names->{bed_file_1_orig}",
            "rm $names->{bed_file_2_orig}";
    }
    return join("\n", @commands) . "\n";
}

# Build the de-duplication commands: run dedup_pipeline.sh over the sorted
# combined BED file, then apply the result to the filtered and the full SAM
# file of each read with de_dup_2_hash.pl.
sub de_duplication {
    my ($reads_ref, $fastq_file_1_ref, $fastq_file_2_ref, $parameters_ref, $files_ref) = @_;
    my $names = $$files_ref;
    my $path  = $$parameters_ref->{path};
    my $reads = $$reads_ref;

    # Anything that is not 'single' gets the two-file form of the pipeline call.
    my $fastq_options = "-f $$fastq_file_1_ref";
    if ( $reads ne 'single' ) {
        $fastq_options .= " -2 $$fastq_file_2_ref";
    }

    my @commands = (
        "echo De-duplication",
        "sort -k4,4 $names->{bed_file_combined}|$path/dedup_pipeline.sh -p $path -b /dev/stdin $fastq_options > $names->{dedup_file}",
    );

    my @sam_pairs = (
        [ 'sam_file_1',      'sam_dedup_file_1' ],
        [ 'sam_file_1_full', 'sam_dedup_file_1_full' ],
    );
    if ( $reads eq 'paired' ) {
        push @sam_pairs,
            [ 'sam_file_2',      'sam_dedup_file_2' ],
            [ 'sam_file_2_full', 'sam_dedup_file_2_full' ];
    }
    for my $pair (@sam_pairs) {
        my ($input_key, $output_key) = @$pair;
        push @commands,
            "$path/de_dup_2_hash.pl -dedup $names->{dedup_file} -sam $names->{$input_key} -o $names->{$output_key}";
    }
    return join("\n", @commands) . "\n";
}

# Build the commands that split reads into on-target and off-target SAM
# files, first for all reads and then for the de-duplicated reads.
#
# For each of the two sets, per read:
#   - create a master file from the SAM file and join it
#   - convert it to a BED file with one entry per read id
#   - reduce the BED entries to single points, so that only the target
#     region overlapping the end point of a read counts as hit (assumes the
#     target regions do not overlap each other)
#   - intersect the points with the target regions
# then mark the on-target reads and write the unmarked rest as off-target:
# "alone" files judge each read by its own intersection; for paired samples
# the remaining files use the combined intersection, so a pair is on-target
# when at least one of R1/R2 is.
#
# The de-duplicated file keys are the all-reads keys with "dedup_" inserted
# (sam_file_1 -> sam_dedup_file_1, bed_points_file_1 -> bed_points_dedup_file_1, ...).
sub select_on_and_off_target_reads {
    my ($reads_ref, $parameters_ref, $files_ref) = @_;
    my $settings = $$parameters_ref;
    my $names    = $$files_ref;
    my $path     = $settings->{path};
    my $reads    = $$reads_ref;
    my $paired   = ( $reads eq 'paired' );
    my @read_numbers = $paired ? (1, 2) : (1);

    # Create BED file of target regions
    my @commands = (
        "echo Select On- and Off-target Reads",
        "$path/convert_target_regions_to_bed.pl -t $names->{target_regions_file} -o $names->{target_regions_bed_file}",
    );

    # '' selects the all-reads file keys, 'dedup_' the de-duplicated ones.
    for my $infix ('', 'dedup_') {

        # Intersection of each read with the target regions
        for my $n (@read_numbers) {
            my $sam       = $names->{"sam_${infix}file_$n"};
            my $prejoin   = $names->{"full_master_${infix}prejoin_file_$n"};
            my $master    = $names->{"full_master_${infix}file_$n"};
            my $bed       = $names->{"bed_${infix}file_$n"};
            my $points    = $names->{"bed_points_${infix}file_$n"};
            my $intersect = $names->{"intersect_${infix}file_$n"};
            push @commands,
                "$path/generate_master_file_without_annotation.pl -sam $sam -o $prejoin",
                "$path/join_master_file.pl -master $prejoin -o $master",
                "$path/convert_master_file_to_bed.pl -master $master -read $reads -tag $n -o $bed",
                "$path/convert_bed_to_single_points.pl -b $bed -read $reads -tag $n -o $points",
                "intersectBed -a $names->{target_regions_bed_file} -b $points -wa -wb > $intersect";
        }

        my $intersect_1_key = "intersect_${infix}file_1";
        my $intersect_2_key = "intersect_${infix}file_2";
        my $combined_key    = "intersect_${infix}file_combined";
        if ($paired) {
            push @commands,
                "cat $names->{$intersect_1_key} $names->{$intersect_2_key} > $names->{$combined_key}";
        }

        # [ read number, intersect file key, scope infix of the output keys ]
        my @selections = ( [ 1, $intersect_1_key, 'alone_' ] );    # On-/Off-target Alone Read 1
        if ($paired) {
            push @selections,
                [ 2, $intersect_2_key, 'alone_' ],                  # On-/Off-target Alone Read 2
                [ 1, $combined_key,    '' ],                        # On-/Off-target Either Read 1
                [ 2, $combined_key,    '' ];                        # On-/Off-target Either Read 2
        }
        for my $selection (@selections) {
            my ($n, $intersect_key, $scope) = @$selection;
            my $sam        = $names->{"sam_${infix}file_$n"};
            my $on_target  = $names->{"sam_${infix}on_target_${scope}file_$n"};
            my $off_target = $names->{"sam_${infix}off_target_${scope}file_$n"};
            my $marked     = $names->{"sam_${infix}file_${n}_marked"};
            push @commands,
                "$path/generate_on_target_sam_files_hash.pl -sam $sam -i $names->{$intersect_key} -on $on_target",
                "$path/mark_on_target_reads.pl -sam $sam -on $on_target -temp_dir $settings->{directory} -o $marked",
                "grep -v 'ON_TARGET' $marked > $off_target";
        }
    }
    return join("\n", @commands) . "\n";
}

# Build the generateHistAndStartSiteInfo.sh call for the de-duplicated SAM
# file of read 1, and of read 2 for paired samples.
sub generate_coverage_and_start_sites {
    my ($reads_ref, $parameters_ref, $files_ref) = @_;
    my $settings = $$parameters_ref;
    my $names    = $$files_ref;
    my $path     = $settings->{path};

    my @commands = ("echo Generate Coverage and Start Sites");
    for my $n ( ( $$reads_ref eq 'paired' ) ? (1, 2) : (1) ) {
        my $sam        = $names->{"sam_dedup_file_$n"};
        my $start_site = $names->{"start_site_dedup_file_$n"};
        my $coverage   = $names->{"coverage_dedup_file_$n"};
        push @commands,
            "bash $path/generateHistAndStartSiteInfo.sh $sam $settings->{reference_file} $settings->{reference_file_index} $start_site $coverage $path";
    }
    return join("\n", @commands) . "\n";
}

# Build the commands that create an annotated master file (one line per
# read) from the de-duplicated on-target reads.  Single-read samples start
# from the "alone" on-target file of read 1; every other read type starts
# from the combined on-target file, and paired samples also process read 2.
sub generate_master_files {
    my ($reads_ref, $parameters_ref, $files_ref) = @_;
    my $settings = $$parameters_ref;
    my $names    = $$files_ref;
    my $path     = $settings->{path};
    my $reads    = $$reads_ref;

    # [ read number, key of the SAM file the master file is built from ]
    my @inputs = (
        [ 1, ( $reads eq 'single' ) ? 'sam_dedup_on_target_alone_file_1' : 'sam_dedup_on_target_file_1' ],
    );
    if ( $reads eq 'paired' ) {
        push @inputs, [ 2, 'sam_dedup_on_target_file_2' ];
    }

    my @commands = ("echo Generate Master Files");
    for my $input (@inputs) {
        my ($n, $sam_key) = @$input;
        my $bare    = $names->{"master_dedup_no_annotation_file_$n"};
        my $prejoin = $names->{"master_dedup_prejoin_file_$n"};
        my $master  = $names->{"master_dedup_file_$n"};
        push @commands,
            "$path/generate_master_file_without_annotation.pl -sam $names->{$sam_key} -o $bare",
            "python $path/archer/annotation/annotate.py --gtf_file $settings->{gtf_file} --coordinate_file $bare --outfile $prejoin --chromosome_indices 1,1 --coordinate_indices 5,6",
            "$path/join_master_file.pl -master $prejoin -o $master";
    }
    return join("\n", @commands) . "\n";
}

sub select_fusion_reads {
    my $reads_ref =
shift;
    my $parameters_ref = shift;
    my $files_ref = shift;

    # Run select_fusion_reads.pl on the master file of each read, then
    # collect the fusion and splice reads into one file each: a copy of the
    # read 1 files for single samples, a concatenation for paired ones.
    my $names  = $$files_ref;
    my $script = "$$parameters_ref->{path}/select_fusion_reads.pl";
    my $reads  = $$reads_ref;

    my @commands = (
        "echo Select Fusion Reads",
        "$script -master $names->{master_dedup_file_1} -tag 1 -o1 $names->{one_segment_reads_file_1} -os $names->{splice_reads_file_1} -of $names->{fusion_reads_file_1} -omf $names->{multi_fusion_reads_file_1}",
    );
    if ( $reads eq 'single' ) {
        push @commands,
            "cp $names->{fusion_reads_file_1} $names->{fusion_reads_file}",
            "cp $names->{splice_reads_file_1} $names->{splice_reads_file}";
    }
    if ( $reads eq 'paired' ) {
        push @commands,
            "$script -master $names->{master_dedup_file_2} -tag 2 -o1 $names->{one_segment_reads_file_2} -os $names->{splice_reads_file_2} -of $names->{fusion_reads_file_2} -omf $names->{multi_fusion_reads_file_2}",
            "cat $names->{fusion_reads_file_1} $names->{fusion_reads_file_2} > $names->{fusion_reads_file}",
            "cat $names->{splice_reads_file_1} $names->{splice_reads_file_2} > $names->{splice_reads_file}";
    }
    return join("\n", @commands) . "\n";
}

# Build the commands that count fusion and splice events and then add the
# splice evidence to the fusion counts.
sub count_fusions {
    my ($reads_ref, $parameters_ref, $files_ref) = @_;
    my $settings = $$parameters_ref;
    my $names    = $$files_ref;
    my $path     = $settings->{path};

    # Options shared by both count_fusions.pl calls.
    my $common = "-min 30 -gtf $settings->{gtf_file} -read $$reads_ref";

    my @commands = (
        "echo Count Fusions and Splice Events",
        # Count the number of each type of fusion pair, i.e., Gene A Exon X
        # with Gene B Exon Y, get the median value of the coordinate of each
        # breakpoint, sort and output each type
        "$path/count_fusions.pl -t $names->{target_regions_file} -fr $names->{fusion_reads_file} $common -limit 10 -min_occ 5 -ob $names->{fusion_counts_bare_file} -o $names->{fusion_counts_file}",
        # Count the number of each type of splice pair - Use default values
        # for -limit and -min_occ so that all splices will be reported
        "$path/count_fusions.pl -t $names->{target_regions_file} -fr $names->{splice_reads_file} $common -ob $names->{splice_counts_bare_file} -o $names->{splice_counts_file}",
        # Add splice evidence to fusion counts
        "$path/add_splice_to_fusion_counts.pl -fcb $names->{fusion_counts_bare_file} -scb $names->{splice_counts_bare_file} -o $names->{fusion_counts_with_splice_bare_file} -om $names->{fusion_counts_with_splice_bare_file_machine}",
    );
    return join("\n", @commands) . "\n";
}

# Build the flanking_sequences.pl calls, one for the fusion counts and one
# for the splice counts.  The read 2 FASTQ file is passed for every read
# type other than 'single'.
sub flanking_sequences {
    my ($reads_ref, $fastq_file_1_ref, $fastq_file_2_ref, $parameters_ref, $files_ref) = @_;
    my $names  = $$files_ref;
    my $script = "$$parameters_ref->{path}/flanking_sequences.pl";

    my $fastq_options = "-fastq_1 $$fastq_file_1_ref";
    if ( $$reads_ref ne 'single' ) {
        $fastq_options .= " -fastq_2 $$fastq_file_2_ref";
    }

    my @commands = (
        "echo Flanking Sequences",
        "$script -fcb $names->{fusion_counts_bare_file} -fr $names->{fusion_reads_file} -read $$reads_ref $fastq_options -o $names->{flanking_sequences_file}",
        "$script -fcb $names->{splice_counts_bare_file} -fr $names->{splice_reads_file} -read $$reads_ref $fastq_options -o $names->{flanking_splice_sequences_file}",
    );
    return join("\n", @commands) . "\n";
}

# Build the samtools commands that turn the de-duplicated SAM file of each
# read into a sorted, indexed BAM file.
sub bam_dedup_files {
    my ($reads_ref, $files_ref) = @_;
    my $names = $$files_ref;

    # Make sorted de-dup BAM files
    my @commands = ("echo BAM Dedup Files");
    for my $n ( ( $$reads_ref eq 'paired' ) ? (1, 2) : (1) ) {
        my $sam         = $names->{"sam_dedup_file_$n"};
        my $bam         = $names->{"bam_dedup_file_$n"};
        my $sorted_name = $names->{"bam_dedup_sorted_file_${n}_name"};
        my $sorted_bam  = $names->{"bam_dedup_sorted_file_$n"};
        push @commands,
            "samtools view -bS $sam > $bam",
            "samtools sort $bam $sorted_name",
            "samtools index $sorted_bam";
    }
    return join("\n", @commands) . "\n";
}

# Build the consensus pipeline calls for the fusion and the splice flanking
# sequences, followed by the call that pairs fusion candidates with splice
# sequences.  Read 2 inputs are passed for every read type other than
# 'single'.
sub consensus_sequences {
    my ($reads_ref, $fastq_file_1_ref, $fastq_file_2_ref, $tag_ref, $parameters_ref, $files_ref) = @_;
    my $settings = $$parameters_ref;
    my $names    = $$files_ref;
    my $path     = $settings->{path};

    # BAM and FASTQ inputs of batch_pipeline.sh
    my $inputs;
    if ( $$reads_ref eq 'single' ) {
        $inputs = "-1 $names->{bam_dedup_sorted_file_1} -a $$fastq_file_1_ref";
    }
    else {
        $inputs = "-1 $names->{bam_dedup_sorted_file_1} -2 $names->{bam_dedup_sorted_file_2} -a $$fastq_file_1_ref -b $$fastq_file_2_ref";
    }

    # Fusion and Splice Consensus Sequences
    my @commands = ("echo Consensus Sequences");
    for my $run ( [ 'flanking_sequences_file', 'fusion' ], [ 'flanking_splice_sequences_file', 'splice' ] ) {
        my ($flanking_key, $kind) = @$run;
        my $std_out = $names->{"consensus_${kind}_std_out_file"};
        my $std_err = $names->{"consensus_${kind}_std_err_file"};
        push @commands,
            "bash $path/consensus_pipeline/batch_pipeline.sh $inputs -f $settings->{reference_file} -r $names->{$flanking_key} -d $path/consensus_pipeline/ -s $std_out -e $std_err";
    }

    # Pair Fusion Candidates with Splice Sequences
    push @commands,
        "$path/pair_fusion_and_splice_sequences.pl -fc $names->{fusion_counts_bare_file} -sc $names->{splice_counts_bare_file} -tag $$tag_ref -o $names->{fusion_and_splice_consensus_file}";

    return join("\n", @commands) . "\n";
}

sub sort_sam_files {
    my $reads_ref = shift;
    my $files_ref = shift;
    my $cmd_line = "echo Sort SAM Files\n";
    $cmd_line .= "sort -k1,1 $$files_ref->{sam_on_target_alone_file_1} > $$files_ref->{sam_on_target_alone_file_1_linux_sorted}\n";
    $cmd_line .= "sort -k1,1 $$files_ref->{sam_off_target_alone_file_1} > $$files_ref->{sam_off_target_alone_file_1_linux_sorted}\n";
    $cmd_line .= "sort -k1,1 $$files_ref->{sam_dedup_on_target_alone_file_1} > $$files_ref->{sam_dedup_on_target_alone_file_1_linux_sorted}\n";
    $cmd_line .= "sort -k1,1 $$files_ref->{sam_dedup_off_target_alone_file_1} > $$files_ref->{sam_dedup_off_target_alone_file_1_linux_sorted}\n";
    $cmd_line .= "samtools view -bS $$files_ref->{sam_file_1_full} > $$files_ref->{bam_file_1_full}\n";
    $cmd_line .= "samtools sort -n $$files_ref->{bam_file_1_full} $$files_ref->{bam_file_1_full_prefix}\n";
    $cmd_line .= "samtools view -h $$files_ref->{bam_file_1_full_sorted} > $$files_ref->{sam_file_1_full_sorted}\n";
    $cmd_line .= "samtools view -bS $$files_ref->{sam_dedup_file_1_full} > $$files_ref->{bam_dedup_file_1_full}\n";
    $cmd_line .= "samtools sort -n $$files_ref->{bam_dedup_file_1_full} 
$$files_ref->{bam_dedup_file_1_full_prefix}\n"; $cmd_line .= "samtools view -h $$files_ref->{bam_dedup_file_1_full_sorted} > $$files_ref->{sam_dedup_file_1_full_sorted}\n"; if ( $$reads_ref eq 'paired' ) { # Need to make this possible to be reverse only too $cmd_line .= "sort -k1,1 $$files_ref->{sam_on_target_file_1} > $$files_ref->{sam_on_target_file_1_linux_sorted}\n"; $cmd_line .= "sort -k1,1 $$files_ref->{sam_off_target_file_1} > $$files_ref->{sam_off_target_file_1_linux_sorted}\n"; $cmd_line .= "sort -k1,1 $$files_ref->{sam_dedup_on_target_file_1} > $$files_ref->{sam_dedup_on_target_file_1_linux_sorted}\n"; $cmd_line .= "sort -k1,1 $$files_ref->{sam_dedup_off_target_file_1} > $$files_ref->{sam_dedup_off_target_file_1_linux_sorted}\n"; $cmd_line .= "sort -k1,1 $$files_ref->{sam_on_target_alone_file_2} > $$files_ref->{sam_on_target_alone_file_2_linux_sorted}\n"; $cmd_line .= "sort -k1,1 $$files_ref->{sam_off_target_alone_file_2} > $$files_ref->{sam_off_target_alone_file_2_linux_sorted}\n"; $cmd_line .= "sort -k1,1 $$files_ref->{sam_dedup_on_target_alone_file_2} > $$files_ref->{sam_dedup_on_target_alone_file_2_linux_sorted}\n"; $cmd_line .= "sort -k1,1 $$files_ref->{sam_dedup_off_target_alone_file_2} > $$files_ref->{sam_dedup_off_target_alone_file_2_linux_sorted}\n"; $cmd_line .= "sort -k1,1 $$files_ref->{sam_on_target_file_2} > $$files_ref->{sam_on_target_file_2_linux_sorted}\n"; $cmd_line .= "sort -k1,1 $$files_ref->{sam_off_target_file_2} > $$files_ref->{sam_off_target_file_2_linux_sorted}\n"; $cmd_line .= "sort -k1,1 $$files_ref->{sam_dedup_on_target_file_2} > $$files_ref->{sam_dedup_on_target_file_2_linux_sorted}\n"; $cmd_line .= "sort -k1,1 $$files_ref->{sam_dedup_off_target_file_2} > $$files_ref->{sam_dedup_off_target_file_2_linux_sorted}\n"; $cmd_line .= "samtools view -bS $$files_ref->{sam_file_2_full} > $$files_ref->{bam_file_2_full}\n"; $cmd_line .= "samtools sort -n $$files_ref->{bam_file_2_full} $$files_ref->{bam_file_2_full_prefix}\n"; $cmd_line .= 
"samtools view -h $$files_ref->{bam_file_2_full_sorted} > $$files_ref->{sam_file_2_full_sorted}\n"; $cmd_line .= "samtools view -bS $$files_ref->{sam_dedup_file_2_full} > $$files_ref->{bam_dedup_file_2_full}\n"; $cmd_line .= "samtools sort -n $$files_ref->{bam_dedup_file_2_full} $$files_ref->{bam_dedup_file_2_full_prefix}\n"; $cmd_line .= "samtools view -h $$files_ref->{bam_dedup_file_2_full_sorted} > $$files_ref->{sam_dedup_file_2_full_sorted}\n"; } return $cmd_line; } sub on_target_stats { my $reads_ref = shift; my $parameters_ref = shift; my $files_ref = shift; my $cmd_line = "echo On-target Stats\n"; if ( -e $$files_ref->{target_regions_file} ) { if ( -s $$files_ref->{target_regions_file} ) { if ( $$reads_ref eq 'single' ) { # Need to make this possible to be reverse only too # Counts of on- and off-target reads $cmd_line .= "$$parameters_ref->{path}/on_target_counts.pl -on_alone_1 $$files_ref->{sam_on_target_alone_file_1_linux_sorted} -off_alone_1 $$files_ref->{sam_off_target_alone_file_1_linux_sorted} -o $$files_ref->{on_target_file}\n"; $cmd_line .= "$$parameters_ref->{path}/on_target_counts.pl -on_alone_1 $$files_ref->{sam_dedup_on_target_alone_file_1_linux_sorted} -off_alone_1 $$files_ref->{sam_dedup_off_target_alone_file_1_linux_sorted} -o $$files_ref->{on_target_dedup_file}\n"; # On-target Stats and Housekeeping Stats $cmd_line .= "$$parameters_ref->{path}/on_target_stats.pl -t $$files_ref->{target_regions_file} -i1 $$files_ref->{intersect_file_1} -o $$files_ref->{reads_per_exon_file} -oh $$files_ref->{housekeeping_file}\n"; $cmd_line .= "$$parameters_ref->{path}/on_target_stats.pl -t $$files_ref->{target_regions_file} -i1 $$files_ref->{intersect_dedup_file_1} -o $$files_ref->{reads_per_exon_dedup_file} -oh $$files_ref->{housekeeping_dedup_file} -om $$files_ref->{reads_per_exon_dedup_file_machine}\n"; } else{ # Counts of on- and off-target reads $cmd_line .= "$$parameters_ref->{path}/on_target_counts.pl -on_alone_1 
$$files_ref->{sam_on_target_alone_file_1_linux_sorted} -on_alone_2 $$files_ref->{sam_on_target_alone_file_2_linux_sorted} -off_alone_1 $$files_ref->{sam_off_target_alone_file_1_linux_sorted} -off_alone_2 $$files_ref->{sam_off_target_alone_file_2_linux_sorted} -on_1 $$files_ref->{sam_on_target_file_1_linux_sorted} -on_2 $$files_ref->{sam_on_target_file_2_linux_sorted} -off_1 $$files_ref->{sam_off_target_file_1_linux_sorted} -off_2 $$files_ref->{sam_off_target_file_2_linux_sorted} -o $$files_ref->{on_target_file}\n"; $cmd_line .= "$$parameters_ref->{path}/on_target_counts.pl -on_alone_1 $$files_ref->{sam_dedup_on_target_alone_file_1_linux_sorted} -on_alone_2 $$files_ref->{sam_dedup_on_target_alone_file_2_linux_sorted} -off_alone_1 $$files_ref->{sam_dedup_off_target_alone_file_1_linux_sorted} -off_alone_2 $$files_ref->{sam_dedup_off_target_alone_file_2_linux_sorted} -on_1 $$files_ref->{sam_dedup_on_target_file_1_linux_sorted} -on_2 $$files_ref->{sam_dedup_on_target_file_2_linux_sorted} -off_1 $$files_ref->{sam_dedup_off_target_file_1_linux_sorted} -off_2 $$files_ref->{sam_dedup_off_target_file_2_linux_sorted} -o $$files_ref->{on_target_dedup_file}\n"; # On-target Stats and Housekeeping Stats $cmd_line .= "$$parameters_ref->{path}/on_target_stats.pl -t $$files_ref->{target_regions_file} -i1 $$files_ref->{intersect_file_1} -i2 $$files_ref->{intersect_file_2} -o $$files_ref->{reads_per_exon_file} -oh $$files_ref->{housekeeping_file}\n"; $cmd_line .= "$$parameters_ref->{path}/on_target_stats.pl -t $$files_ref->{target_regions_file} -i1 $$files_ref->{intersect_dedup_file_1} -i2 $$files_ref->{intersect_dedup_file_2} -o $$files_ref->{reads_per_exon_dedup_file} -oh $$files_ref->{housekeeping_dedup_file} -om $$files_ref->{reads_per_exon_dedup_file_machine}\n"; } } } return $cmd_line; } sub total_molecule_counts { my $reads_ref = shift; my $parameters_ref = shift; my $files_ref = shift; my $cmd_line = "echo Total Molecule Counts\n"; if ( $$reads_ref eq 'single' ) { $cmd_line .= 
"python $$parameters_ref->{path}/count_reads_and_alignments_v2.py -r1 $$files_ref->{sam_file_1_full_sorted} -o $$files_ref->{total_and_aligned_molecule_count_per_tag_file}\n"; } else { $cmd_line .= "python $$parameters_ref->{path}/count_reads_and_alignments_v2.py -r1 $$files_ref->{sam_file_1_full_sorted} -r2 $$files_ref->{sam_file_2_full_sorted} -o $$files_ref->{total_and_aligned_molecule_count_per_tag_file}\n"; } return $cmd_line; } sub de_duplicated_molecule_counts { my $reads_ref = shift; my $parameters_ref = shift; my $files_ref = shift; my $cmd_line = "echo De-duplicated Molecule Counts\n"; if ( $$reads_ref eq 'single' ) { $cmd_line .= "python $$parameters_ref->{path}/count_reads_and_alignments_v2.py -r1 $$files_ref->{sam_dedup_file_1_full_sorted} -o $$files_ref->{unique_and_aligned_molecule_count_per_tag_file}\n"; } else { $cmd_line .= "python $$parameters_ref->{path}/count_reads_and_alignments_v2.py -r1 $$files_ref->{sam_dedup_file_1_full_sorted} -r2 $$files_ref->{sam_dedup_file_2_full_sorted} -o $$files_ref->{unique_and_aligned_molecule_count_per_tag_file}\n"; } return $cmd_line; } sub all_molecule_counts { my $parameters_ref = shift; my $files_ref = shift; my $cmd_line = "echo All Molecule Counts\n"; $cmd_line .= "$$parameters_ref->{path}/counts_2.pl -tamc $$files_ref->{total_and_aligned_molecule_count_per_tag_file} -uamc $$files_ref->{unique_and_aligned_molecule_count_per_tag_file} -otd $$files_ref->{on_target_dedup_file} -ot $$files_ref->{on_target_file} -o $$files_ref->{counts_file} -om $$files_ref->{counts_file_machine}\n"; return $cmd_line; } sub qc_check { my $parameters_ref = shift; my $files_ref = shift; my $cmd_line = "echo QC Check\n"; $cmd_line .= "$$parameters_ref->{path}/qc_check.pl -hd $$files_ref->{housekeeping_dedup_file} -o $$files_ref->{qc_filter_file} -om $$files_ref->{qc_filter_file_machine}\n"; return $cmd_line; } sub coverage_uniformity { my $reads_ref = shift; my $parameters_ref = shift; my $files_ref = shift; my $cmd_line = "echo 
Coverage Uniformity\n"; $cmd_line .= "$$parameters_ref->{path}/coverage_uniformity.pl -hd $$files_ref->{housekeeping_dedup_file} -r $$reads_ref -o $$files_ref->{coverage_uniformity_file} -om $$files_ref->{coverage_uniformity_file_machine}\n"; return $cmd_line; } sub summary { my $tag_ref = shift; my $parameters_ref = shift; my $files_ref = shift; my $cmd_line = "echo Summary\n"; # my $sample_name = "'" . @$samples_array_ref[$sample_element_number] . "'"; $cmd_line .= "$$parameters_ref->{path}/summary.pl -s $$tag_ref -o $$files_ref->{summary_file} -om $$files_ref->{summary_file_machine}\n"; $cmd_line .= "cat $$files_ref->{qc_filter_file} >> $$files_ref->{summary_file}\n"; # $cmd_line .= "cat $$files_ref->{coverage_uniformity_file} >> $$files_ref->{summary_file}\n"; $cmd_line .= "cat $$files_ref->{counts_file} >> $$files_ref->{summary_file}\n"; if ( -e $$files_ref->{reads_per_exon_dedup_file} ) { if ( -s $$files_ref->{reads_per_exon_dedup_file} ) { $cmd_line .= "cat $$files_ref->{reads_per_exon_dedup_file} >> $$files_ref->{summary_file}\n"; } } $cmd_line .= "cat $$files_ref->{fusion_counts_with_splice_bare_file} >> $$files_ref->{summary_file}\n"; # $cmd_line .= "enscript -f Courier8 -p $$files_ref->{summary_file_ps} $$files_ref->{summary_file}\n"; # $cmd_line .= "ps2pdf $$files_ref->{summary_file_ps} $$files_ref->{summary_file_pdf}\n"; # Machine Readable Summary File $cmd_line .= "cat $$files_ref->{qc_filter_file_machine} >> $$files_ref->{summary_file_machine}\n"; # $cmd_line .= "cat $$files_ref->{coverage_uniformity_file_machine} >> $$files_ref->{summary_file_machine}\n"; $cmd_line .= "cat $$files_ref->{counts_file_machine} >> $$files_ref->{summary_file_machine}\n"; if ( -e $$files_ref->{reads_per_exon_dedup_file_machine} ) { if ( -s $$files_ref->{reads_per_exon_dedup_file_machine} ) { $cmd_line .= "cat $$files_ref->{reads_per_exon_dedup_file_machine} >> $$files_ref->{summary_file_machine}\n"; } } $cmd_line .= "cat 
$$files_ref->{fusion_counts_with_splice_bare_file_machine} >> $$files_ref->{summary_file_machine}\n"; return $cmd_line; } sub clean_up { my $reads_ref = shift; my $parameters_ref = shift; my $files_ref = shift; my $file; my $cmd_line = "echo Clean Up\n"; # Clean up Flanking Sequences Files $cmd_line .= "$$parameters_ref->{path}/clean_up_flanking_sequences.pl -f $$files_ref->{flanking_sequences_file}\n"; # Clean up Flanking Splice Sequences Files $cmd_line .= "$$parameters_ref->{path}/clean_up_flanking_sequences.pl -f $$files_ref->{flanking_splice_sequences_file}\n"; # Clean up $cmd_line .= "rm $$files_ref->{sam_file_1_full}\n"; $cmd_line .= "rm $$files_ref->{bam_file_1}\n"; $cmd_line .= "rm $$files_ref->{bed_file_1}\n"; $cmd_line .= "rm $$files_ref->{bed_file_combined}\n"; $cmd_line .= "rm $$files_ref->{sam_dedup_file_1}\n"; $cmd_line .= "rm $$files_ref->{sam_dedup_file_1_full}\n"; $cmd_line .= "rm $$files_ref->{bam_dedup_file_1}\n"; $cmd_line .= "rm $$files_ref->{full_master_prejoin_file_1}\n"; $cmd_line .= "rm $$files_ref->{full_master_file_1}\n"; $cmd_line .= "rm $$files_ref->{full_master_dedup_prejoin_file_1}\n"; $cmd_line .= "rm $$files_ref->{full_master_dedup_file_1}\n"; $cmd_line .= "rm $$files_ref->{bed_points_file_1}\n"; $cmd_line .= "rm $$files_ref->{bed_points_dedup_file_1}\n"; $cmd_line .= "rm $$files_ref->{intersect_file_1}\n"; $cmd_line .= "rm $$files_ref->{intersect_dedup_file_1}\n"; $cmd_line .= "rm $$files_ref->{sam_on_target_alone_file_1}\n"; $cmd_line .= "rm $$files_ref->{sam_off_target_alone_file_1}\n"; $cmd_line .= "rm $$files_ref->{master_dedup_no_annotation_file_1}\n"; $cmd_line .= "rm $$files_ref->{master_dedup_prejoin_file_1}\n"; $cmd_line .= "rm $$files_ref->{master_dedup_file_1}\n"; $cmd_line .= "rm $$files_ref->{fusion_reads_file}\n"; $cmd_line .= "rm $$files_ref->{fusion_counts_file}\n"; $cmd_line .= "rm $$files_ref->{fusion_counts_bare_file}\n"; # $file = $$files_ref->{flanking_sequences_file} . 
".tmp~"; # $cmd_line .= "rm $file\n"; # $file = $$files_ref->{flanking_splice_sequences_file} . ".tmp~"; # $cmd_line .= "rm $file\n"; $cmd_line .= "rm $$files_ref->{splice_counts_file}\n"; $cmd_line .= "rm $$files_ref->{splice_counts_bare_file}\n"; $cmd_line .= "rm $$files_ref->{fusion_counts_with_splice_bare_file}\n"; $cmd_line .= "rm $$files_ref->{fusion_counts_with_splice_bare_file_machine}\n"; $cmd_line .= "rm $$files_ref->{splice_reads_file}\n"; $cmd_line .= "rm $$files_ref->{sam_on_target_alone_file_1_linux_sorted}\n"; $cmd_line .= "rm $$files_ref->{sam_off_target_alone_file_1_linux_sorted}\n"; $cmd_line .= "rm $$files_ref->{sam_dedup_on_target_alone_file_1_linux_sorted}\n"; $cmd_line .= "rm $$files_ref->{sam_dedup_off_target_alone_file_1_linux_sorted}\n"; $cmd_line .= "rm $$files_ref->{housekeeping_dedup_file}\n"; $cmd_line .= "rm $$files_ref->{on_target_dedup_file}\n"; $cmd_line .= "rm $$files_ref->{reads_per_exon_dedup_file}\n"; $cmd_line .= "rm $$files_ref->{reads_per_exon_dedup_file_machine}\n"; $cmd_line .= "rm $$files_ref->{housekeeping_file}\n"; $cmd_line .= "rm $$files_ref->{on_target_file}\n"; $cmd_line .= "rm $$files_ref->{reads_per_exon_file}\n"; $cmd_line .= "rm $$files_ref->{sam_file_1_marked}\n"; $cmd_line .= "rm $$files_ref->{sam_dedup_file_1_marked}\n"; $cmd_line .= "rm $$files_ref->{bam_file_1_full}\n"; $cmd_line .= "rm $$files_ref->{bam_file_1_full_sorted}\n"; $cmd_line .= "rm $$files_ref->{sam_file_1_full_sorted}\n"; # $cmd_line .= "rm $$files_ref->{total_and_aligned_molecule_count_file_1}\n"; $cmd_line .= "rm $$files_ref->{total_and_aligned_molecule_count_per_tag_file}\n"; # $cmd_line .= "rm $$files_ref->{unique_and_aligned_molecule_count_file_1}\n"; $cmd_line .= "rm $$files_ref->{unique_and_aligned_molecule_count_per_tag_file}\n"; $cmd_line .= "rm $$files_ref->{bam_dedup_file_1_full}\n"; $cmd_line .= "rm $$files_ref->{bam_dedup_file_1_full_sorted}\n"; $cmd_line .= "rm $$files_ref->{sam_dedup_file_1_full_sorted}\n"; $cmd_line .= "rm 
$$files_ref->{qc_filter_file}\n"; $cmd_line .= "rm $$files_ref->{qc_filter_file_machine}\n"; $cmd_line .= "rm $$files_ref->{coverage_uniformity_file}\n"; $cmd_line .= "rm $$files_ref->{coverage_uniformity_file_machine}\n"; $cmd_line .= "rm $$files_ref->{counts_file}\n"; $cmd_line .= "rm $$files_ref->{counts_file_machine}\n"; # $cmd_line .= "rm $$files_ref->{summary_file_ps}\n"; if ( $$reads_ref eq 'paired' ) { $cmd_line .= "rm $$files_ref->{sam_file_2_full}\n"; $cmd_line .= "rm $$files_ref->{bam_file_2}\n"; $cmd_line .= "rm $$files_ref->{bed_file_2}\n"; $cmd_line .= "rm $$files_ref->{sam_dedup_file_2}\n"; $cmd_line .= "rm $$files_ref->{sam_dedup_file_2_full}\n"; $cmd_line .= "rm $$files_ref->{bam_dedup_file_2}\n"; $cmd_line .= "rm $$files_ref->{full_master_prejoin_file_2}\n"; $cmd_line .= "rm $$files_ref->{full_master_file_2}\n"; $cmd_line .= "rm $$files_ref->{full_master_dedup_prejoin_file_2}\n"; $cmd_line .= "rm $$files_ref->{full_master_dedup_file_2}\n"; $cmd_line .= "rm $$files_ref->{bed_points_file_2}\n"; $cmd_line .= "rm $$files_ref->{bed_points_dedup_file_2}\n"; $cmd_line .= "rm $$files_ref->{intersect_file_2}\n"; $cmd_line .= "rm $$files_ref->{intersect_dedup_file_2}\n"; $cmd_line .= "rm $$files_ref->{intersect_file_combined}\n"; $cmd_line .= "rm $$files_ref->{intersect_dedup_file_combined}\n"; $cmd_line .= "rm $$files_ref->{sam_dedup_on_target_alone_file_1}\n"; $cmd_line .= "rm $$files_ref->{sam_dedup_off_target_alone_file_1}\n"; $cmd_line .= "rm $$files_ref->{sam_on_target_file_1}\n"; $cmd_line .= "rm $$files_ref->{sam_off_target_file_1}\n"; $cmd_line .= "rm $$files_ref->{sam_on_target_file_2}\n"; $cmd_line .= "rm $$files_ref->{sam_off_target_file_2}\n"; $cmd_line .= "rm $$files_ref->{sam_on_target_alone_file_2}\n"; $cmd_line .= "rm $$files_ref->{sam_off_target_alone_file_2}\n"; $cmd_line .= "rm $$files_ref->{sam_dedup_on_target_alone_file_2}\n"; $cmd_line .= "rm $$files_ref->{sam_dedup_off_target_alone_file_2}\n"; $cmd_line .= "rm 
$$files_ref->{master_dedup_no_annotation_file_2}\n"; $cmd_line .= "rm $$files_ref->{master_dedup_prejoin_file_2}\n"; $cmd_line .= "rm $$files_ref->{master_dedup_file_2}\n"; $cmd_line .= "rm $$files_ref->{sam_on_target_file_1_linux_sorted}\n"; $cmd_line .= "rm $$files_ref->{sam_off_target_file_1_linux_sorted}\n"; $cmd_line .= "rm $$files_ref->{sam_dedup_on_target_file_1_linux_sorted}\n"; $cmd_line .= "rm $$files_ref->{sam_dedup_off_target_file_1_linux_sorted}\n"; $cmd_line .= "rm $$files_ref->{sam_on_target_alone_file_2_linux_sorted}\n"; $cmd_line .= "rm $$files_ref->{sam_off_target_alone_file_2_linux_sorted}\n"; $cmd_line .= "rm $$files_ref->{sam_dedup_on_target_alone_file_2_linux_sorted}\n"; $cmd_line .= "rm $$files_ref->{sam_dedup_off_target_alone_file_2_linux_sorted}\n"; $cmd_line .= "rm $$files_ref->{sam_on_target_file_2_linux_sorted}\n"; $cmd_line .= "rm $$files_ref->{sam_off_target_file_2_linux_sorted}\n"; $cmd_line .= "rm $$files_ref->{sam_dedup_on_target_file_2_linux_sorted}\n"; $cmd_line .= "rm $$files_ref->{sam_dedup_off_target_file_2_linux_sorted}\n"; $cmd_line .= "rm $$files_ref->{sam_file_2_marked}\n"; $cmd_line .= "rm $$files_ref->{sam_dedup_file_2_marked}\n"; $cmd_line .= "rm $$files_ref->{bam_file_2_full}\n"; $cmd_line .= "rm $$files_ref->{bam_file_2_full_sorted}\n"; $cmd_line .= "rm $$files_ref->{sam_file_2_full_sorted}\n"; # $cmd_line .= "rm $$files_ref->{total_and_aligned_molecule_count_file_2}\n"; # $cmd_line .= "rm $$files_ref->{unique_and_aligned_molecule_count_file_2}\n"; $cmd_line .= "rm $$files_ref->{bam_dedup_file_2_full}\n"; $cmd_line .= "rm $$files_ref->{bam_dedup_file_2_full_sorted}\n"; $cmd_line .= "rm $$files_ref->{sam_dedup_file_2_full_sorted}\n"; } return $cmd_line; } sub summary_for_unprocessed_sample { my $reads_ref = shift; my $fastq_file_1_ref = shift; my $fastq_file_2_ref = shift; my $parameters_ref = shift; my $files_ref = shift; my $cmd_line = "echo Summary for Unprocessed Sample\n"; $cmd_line .= 
"$$parameters_ref->{path}/summary_for_unprocessed_samples.pl -r $$reads_ref -f1 $$fastq_file_1_ref -f2 $$fastq_file_2_ref -o $$files_ref->{summary_file} -om $$files_ref->{summary_file_machine}\n"; # $cmd_line .= "enscript -f Courier8 -p $$files_ref->{summary_file_ps} $$files_ref->{summary_file}\n"; # $cmd_line .= "ps2pdf $$files_ref->{summary_file_ps} $$files_ref->{summary_file_pdf}\n"; return $cmd_line; } sub join_multiple_samples { my $parameters_ref = shift; my $cmd_line = "echo Join Multiple Samples\n"; $cmd_line = "$$parameters_ref->{path}/join_multisample_output.pl -config $$parameters_ref->{config_file} -o $$parameters_ref->{outputfile}\n"; return $cmd_line; } #sub define_alignment_file_names { # my $reads_ref = shift; # my $tags_array_ref = shift; # my $parameters_ref = shift; # my $files_ref = shift; # $$files_ref->{sam_file_1} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[0] . ".sam"; # $$files_ref->{sam_file_1_full} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[0] . ".sam.full"; # if ( $$reads_ref eq 'paired' ) { # $$files_ref->{sam_file_2} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[1] . ".sam"; # $$files_ref->{sam_file_2_full} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[1] . ".sam.full"; # } #} sub define_alignments_file_names { my $reads_ref = shift; my $tags_array_ref = shift; my $tag_ref = shift; my $parameters_ref = shift; my $files_ref = shift; $$files_ref->{sam_file_1_orig} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[0] . ".fastq.sam"; $$files_ref->{sam_file_1_full} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[0] . ".sam.full"; $$files_ref->{sam_file_1} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[0] . ".sam"; $$files_ref->{bam_file_1} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[0] . ".bam"; $$files_ref->{bed_file_1_orig} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[0] . 
".bed.orig"; if ( $$reads_ref eq 'paired' ) { $$files_ref->{sam_file_2_orig} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[1] . ".fastq.sam"; $$files_ref->{sam_file_2_full} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[1] . ".sam.full"; $$files_ref->{sam_file_2} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[1] . ".sam"; $$files_ref->{bam_file_2} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[1] . ".bam"; $$files_ref->{bed_file_2_orig} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[1] . ".bed.orig"; } $$files_ref->{bed_file_combined} = $$parameters_ref->{directory} . "/" . $$tag_ref . ".combined.bed"; } sub define_de_duplication_file_names { my $reads_ref = shift; my $tags_array_ref = shift; my $tag_ref = shift; my $parameters_ref = shift; my $files_ref = shift; $$files_ref->{dedup_file} = $$parameters_ref->{directory} . "/" . $$tag_ref . ".dedup_read_ids.dat"; $$files_ref->{sam_dedup_file_1} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[0] . ".sam.dedup"; $$files_ref->{sam_dedup_file_1_full} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[0] . ".dedup.sam.full"; if ( $$reads_ref eq 'paired' ) { $$files_ref->{sam_dedup_file_2} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[1] . ".sam.dedup"; $$files_ref->{sam_dedup_file_2_full} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[1] . ".dedup.sam.full"; } } sub define_on_and_off_target_file_names { my $reads_ref = shift; my $tags_array_ref = shift; my $tag_ref = shift; my $parameters_ref = shift; my $files_ref = shift; # File 1 $$files_ref->{target_regions_bed_file} = $$parameters_ref->{directory} . "/target_regions.bed"; $$files_ref->{full_master_prejoin_file_1} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[0] . ".full.prejoin.master.dat"; $$files_ref->{full_master_file_1} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[0] . ".full.master.dat"; $$files_ref->{bed_file_1} = $$parameters_ref->{directory} . "/" . 
@$tags_array_ref[0] . ".bed"; $$files_ref->{bed_points_file_1} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[0] . ".points.bed"; $$files_ref->{intersect_file_1} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[0] . ".intersect.dat"; $$files_ref->{sam_on_target_alone_file_1} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[0] . ".on_target.alone.sam"; $$files_ref->{sam_file_1_marked} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[0] . ".sam.marked"; $$files_ref->{sam_off_target_alone_file_1} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[0] . ".off_target.alone.sam"; $$files_ref->{sam_on_target_file_1} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[0] . ".on_target.sam"; $$files_ref->{sam_off_target_file_1} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[0] . ".off_target.sam"; # File 1 Dedup $$files_ref->{full_master_dedup_prejoin_file_1} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[0] . ".full.dedup.prejoin.master.dat"; $$files_ref->{full_master_dedup_file_1} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[0] . ".full.dedup.master.dat"; $$files_ref->{bed_dedup_file_1} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[0] . ".dedup.bed"; $$files_ref->{bed_points_dedup_file_1} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[0] . ".dedup.points.bed"; $$files_ref->{intersect_dedup_file_1} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[0] . ".dedup.intersect.dat"; $$files_ref->{sam_dedup_on_target_alone_file_1} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[0] . ".dedup.on_target.alone.sam"; $$files_ref->{sam_dedup_file_1_marked} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[0] . ".sam.dedup.marked"; $$files_ref->{sam_dedup_off_target_alone_file_1} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[0] . ".dedup.off_target.alone.sam"; $$files_ref->{sam_dedup_on_target_file_1} = $$parameters_ref->{directory} . "/" . 
@$tags_array_ref[0] . ".dedup.on_target.sam";
    $$files_ref->{sam_dedup_off_target_file_1} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[0] . ".dedup.off_target.sam";

    # File 2
    if ( $$reads_ref eq 'paired' ) {
        $$files_ref->{full_master_prejoin_file_2} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[1] . ".full.prejoin.master.dat";
        $$files_ref->{full_master_file_2} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[1] . ".full.master.dat";
        $$files_ref->{bed_file_2} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[1] . ".bed";
        $$files_ref->{bed_points_file_2} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[1] . ".points.bed";
        $$files_ref->{intersect_file_2} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[1] . ".intersect.dat";
        $$files_ref->{sam_on_target_alone_file_2} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[1] . ".on_target.alone.sam";
        $$files_ref->{sam_file_2_marked} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[1] . ".sam.marked";
        $$files_ref->{sam_off_target_alone_file_2} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[1] . ".off_target.alone.sam";
        $$files_ref->{sam_on_target_file_2} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[1] . ".on_target.sam";
        $$files_ref->{sam_off_target_file_2} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[1] . ".off_target.sam";
    }

    # File 2 Dedup
    if ( $$reads_ref eq 'paired' ) {
        $$files_ref->{full_master_dedup_prejoin_file_2} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[1] . ".full.dedup.prejoin.master.dat";
        $$files_ref->{full_master_dedup_file_2} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[1] . ".full.dedup.master.dat";
        $$files_ref->{bed_dedup_file_2} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[1] . ".dedup.bed";
        $$files_ref->{bed_points_dedup_file_2} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[1] . ".dedup.points.bed";
        $$files_ref->{intersect_dedup_file_2} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[1] . ".dedup.intersect.dat";
        $$files_ref->{sam_dedup_on_target_alone_file_2} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[1] . ".dedup.on_target.alone.sam";
        $$files_ref->{sam_dedup_file_2_marked} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[1] . ".sam.dedup.marked";
        $$files_ref->{sam_dedup_off_target_alone_file_2} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[1] . ".dedup.off_target.alone.sam";
        $$files_ref->{sam_dedup_on_target_file_2} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[1] . ".dedup.on_target.sam";
        $$files_ref->{sam_dedup_off_target_file_2} = $$parameters_ref->{directory} . "/" . @$tags_array_ref[1] . ".dedup.off_target.sam";
    }

    $$files_ref->{intersect_file_combined} = $$parameters_ref->{directory} . "/" . $$tag_ref . ".intersect_combined.dat";
    $$files_ref->{intersect_dedup_file_combined} = $$parameters_ref->{directory} . "/" . $$tag_ref . ".dedup.intersect_combined.dat";
}

# ---------------------------------------------------------------------------
# File-name definitions
#
# The subs below only fill in entries of the files hash; they do not touch
# the file system.  They share these argument conventions:
#
#   $reads_ref      - reference to a scalar holding the read type; entries
#                     for file 2 are added only when it equals 'paired'
#   $tags_array_ref - array reference; element 0 is the tag used for file 1
#                     names and element 1 the tag used for file 2 names
#   $tag_ref        - reference to a scalar holding the tag used for
#                     per-sample (combined) names
#   $parameters_ref - reference to a scalar holding the parameters hash
#                     reference; only its 'directory' entry is read
#   $files_ref      - reference to a scalar holding the files hash reference
#                     that receives the new entries
#
# Every sub returns nothing; its result is the set of entries stored in the
# files hash.
# ---------------------------------------------------------------------------

# Store one path per (key => suffix) pair in the files hash.
#
# Each path is built as <directory> . "/" . <tag> . <suffix>, where
# <directory> is $$parameters_ref->{directory}.
#
# Arguments:
#   $files_ref      - reference to the scalar holding the files hash reference
#   $parameters_ref - reference to the scalar holding the parameters hash reference
#   $tag            - tag placed directly after "<directory>/"
#   %suffix_for     - files-hash key => suffix appended to the tag
sub _set_file_names {
    my ($files_ref, $parameters_ref, $tag, %suffix_for) = @_;

    my $prefix = $$parameters_ref->{directory} . "/" . $tag;
    for my $key (keys %suffix_for) {
        $$files_ref->{$key} = $prefix . $suffix_for{$key};
    }
    return;
}

# Define the de-duplicated start-site and coverage bedgraph names for
# file 1 and, for paired reads, file 2.
sub define_coverage_and_start_site_file_names {
    my ($reads_ref, $tags_array_ref, $parameters_ref, $files_ref) = @_;

    _set_file_names($files_ref, $parameters_ref, $tags_array_ref->[0],
        start_site_dedup_file_1 => ".dedup.start_site.bedgraph",
        coverage_dedup_file_1   => ".dedup.coverage.bedgraph",
    );

    if ( $$reads_ref eq 'paired' ) {
        _set_file_names($files_ref, $parameters_ref, $tags_array_ref->[1],
            start_site_dedup_file_2 => ".dedup.start_site.bedgraph",
            coverage_dedup_file_2   => ".dedup.coverage.bedgraph",
        );
    }
    return;
}

# Define the de-duplicated master file names (no-annotation, prejoin and
# final) for file 1 and, for paired reads, file 2.
sub define_master_files_file_names {
    my ($reads_ref, $tags_array_ref, $parameters_ref, $files_ref) = @_;

    _set_file_names($files_ref, $parameters_ref, $tags_array_ref->[0],
        master_dedup_no_annotation_file_1 => ".dedup.no_annotation.master.dat",
        master_dedup_prejoin_file_1       => ".dedup.prejoin.master.dat",
        master_dedup_file_1               => ".dedup.master.dat",
    );

    if ( $$reads_ref eq 'paired' ) {
        _set_file_names($files_ref, $parameters_ref, $tags_array_ref->[1],
            master_dedup_no_annotation_file_2 => ".dedup.no_annotation.master.dat",
            master_dedup_prejoin_file_2       => ".dedup.prejoin.master.dat",
            master_dedup_file_2               => ".dedup.master.dat",
        );
    }
    return;
}

# Define the one-segment, splice, fusion and multi-fusion read file names
# for file 1 (and file 2 for paired reads), plus the combined fusion and
# splice read file names keyed by the sample tag.
sub define_fusion_reads_file_names {
    my ($reads_ref, $tags_array_ref, $tag_ref, $parameters_ref, $files_ref) = @_;

    _set_file_names($files_ref, $parameters_ref, $tags_array_ref->[0],
        one_segment_reads_file_1  => ".one_segment_reads.dat",
        splice_reads_file_1       => ".splice_reads.dat",
        fusion_reads_file_1       => ".fusion_reads.dat",
        multi_fusion_reads_file_1 => ".multi_fusion_reads.dat",
    );

    if ( $$reads_ref eq 'paired' ) {
        _set_file_names($files_ref, $parameters_ref, $tags_array_ref->[1],
            one_segment_reads_file_2  => ".one_segment_reads.dat",
            splice_reads_file_2       => ".splice_reads.dat",
            fusion_reads_file_2       => ".fusion_reads.dat",
            multi_fusion_reads_file_2 => ".multi_fusion_reads.dat",
        );
    }

    _set_file_names($files_ref, $parameters_ref, $$tag_ref,
        fusion_reads_file => ".fusion_reads.combined.dat",
        splice_reads_file => ".splice_reads.combined.dat",
    );
    return;
}

# Define the fusion and splice count file names for the sample tag.
sub define_count_fusions_file_names {
    my ($tag_ref, $parameters_ref, $files_ref) = @_;

    _set_file_names($files_ref, $parameters_ref, $$tag_ref,
        fusion_counts_bare_file                     => ".fusion_counts_bare.dat",
        fusion_counts_file                          => ".fusion_counts.dat",
        splice_counts_bare_file                     => ".splice_counts_bare.dat",
        splice_counts_file                          => ".splice_counts.dat",
        fusion_counts_with_splice_bare_file         => ".fusion_counts_with_splice_bare.dat",
        fusion_counts_with_splice_bare_file_machine => ".fusion_counts_with_splice_bare.machine.dat",
    );
    return;
}

# Define the flanking-sequence file names for the sample tag.
sub define_flanking_sequences_file_names {
    my ($tag_ref, $parameters_ref, $files_ref) = @_;

    _set_file_names($files_ref, $parameters_ref, $$tag_ref,
        flanking_sequences_file        => ".flanking_sequences.dat",
        flanking_splice_sequences_file => ".flanking_splice_sequences.dat",
    );
    return;
}

# Define the consensus std-out/std-err file names (fusion and splice) and
# the combined consensus FASTA file name for the sample tag.
sub define_consensus_sequences_file_names {
    my ($tag_ref, $parameters_ref, $files_ref) = @_;

    _set_file_names($files_ref, $parameters_ref, $$tag_ref,
        consensus_fusion_std_out_file    => ".consensus_fusion_std_out.dat",
        consensus_fusion_std_err_file    => ".consensus_fusion_std_err.dat",
        consensus_splice_std_out_file    => ".consensus_splice_std_out.dat",
        consensus_splice_std_err_file    => ".consensus_splice_std_err.dat",
        fusion_and_splice_consensus_file => ".fusion_and_splice_consensus_sequences.fasta",
    );
    return;
}

# Define the de-duplicated BAM file names (unsorted, sorted name without
# extension, and sorted) for file 1 and, for paired reads, file 2.
sub define_bam_dedup_files_file_names {
    my ($reads_ref, $tags_array_ref, $parameters_ref, $files_ref) = @_;

    _set_file_names($files_ref, $parameters_ref, $tags_array_ref->[0],
        bam_dedup_file_1             => ".dedup.bam",
        bam_dedup_sorted_file_1_name => ".dedup.sorted",
        bam_dedup_sorted_file_1      => ".dedup.sorted.bam",
    );

    if ( $$reads_ref eq 'paired' ) {
        _set_file_names($files_ref, $parameters_ref, $tags_array_ref->[1],
            bam_dedup_file_2             => ".dedup.bam",
            bam_dedup_sorted_file_2_name => ".dedup.sorted",
            bam_dedup_sorted_file_2      => ".dedup.sorted.bam",
        );
    }
    return;
}

# Define the names used when sorting SAM files: the "linux_sorted" on-/off-
# target SAM names and the full BAM/SAM names, each in plain and
# de-duplicated form, for file 1 and, for paired reads, file 2.
sub define_sort_sam_files_file_names {
    my ($reads_ref, $tags_array_ref, $parameters_ref, $files_ref) = @_;

    _set_file_names($files_ref, $parameters_ref, $tags_array_ref->[0],
        sam_on_target_alone_file_1_linux_sorted        => ".on_target.alone.linux_sorted.sam",
        sam_off_target_alone_file_1_linux_sorted       => ".off_target.alone.linux_sorted.sam",
        sam_dedup_on_target_alone_file_1_linux_sorted  => ".dedup.on_target.alone.linux_sorted.sam",
        sam_dedup_off_target_alone_file_1_linux_sorted => ".dedup.off_target.alone.linux_sorted.sam",
        sam_on_target_file_1_linux_sorted              => ".on_target.linux_sorted.sam",
        sam_off_target_file_1_linux_sorted             => ".off_target.linux_sorted.sam",
        sam_dedup_on_target_file_1_linux_sorted        => ".dedup.on_target.linux_sorted.sam",
        sam_dedup_off_target_file_1_linux_sorted       => ".dedup.off_target.linux_sorted.sam",
        bam_file_1_full                                => ".bam.full",
        bam_file_1_full_prefix                         => ".bam.full.prefix",
        sam_file_1_full_sorted                         => ".sam.full.sorted",
        bam_dedup_file_1_full                          => ".dedup.bam.full",
        bam_dedup_file_1_full_prefix                   => ".dedup.bam.full.prefix",
        sam_dedup_file_1_full_sorted                   => ".dedup.sam.full.sorted",
    );
    # The sorted BAM names are the corresponding prefix entries plus ".bam",
    # so they must be derived after the prefixes have been stored.
    $$files_ref->{bam_file_1_full_sorted}       = $$files_ref->{bam_file_1_full_prefix} . ".bam";
    $$files_ref->{bam_dedup_file_1_full_sorted} = $$files_ref->{bam_dedup_file_1_full_prefix} . ".bam";

    if ( $$reads_ref eq 'paired' ) {
        _set_file_names($files_ref, $parameters_ref, $tags_array_ref->[1],
            sam_on_target_alone_file_2_linux_sorted        => ".on_target.alone.linux_sorted.sam",
            sam_off_target_alone_file_2_linux_sorted       => ".off_target.alone.linux_sorted.sam",
            sam_dedup_on_target_alone_file_2_linux_sorted  => ".dedup.on_target.alone.linux_sorted.sam",
            sam_dedup_off_target_alone_file_2_linux_sorted => ".dedup.off_target.alone.linux_sorted.sam",
            sam_on_target_file_2_linux_sorted              => ".on_target.linux_sorted.sam",
            sam_off_target_file_2_linux_sorted             => ".off_target.linux_sorted.sam",
            sam_dedup_on_target_file_2_linux_sorted        => ".dedup.on_target.linux_sorted.sam",
            sam_dedup_off_target_file_2_linux_sorted       => ".dedup.off_target.linux_sorted.sam",
            bam_file_2_full                                => ".bam.full",
            bam_file_2_full_prefix                         => ".bam.full.prefix",
            sam_file_2_full_sorted                         => ".sam.full.sorted",
            bam_dedup_file_2_full                          => ".dedup.bam.full",
            bam_dedup_file_2_full_prefix                   => ".dedup.bam.full.prefix",
            sam_dedup_file_2_full_sorted                   => ".dedup.sam.full.sorted",
        );
        $$files_ref->{bam_file_2_full_sorted}       = $$files_ref->{bam_file_2_full_prefix} . ".bam";
        $$files_ref->{bam_dedup_file_2_full_sorted} = $$files_ref->{bam_dedup_file_2_full_prefix} . ".bam";
    }
    return;
}

# Define the on-target, reads-per-exon and housekeeping statistics file
# names (plain and de-duplicated) for the sample tag.
sub define_on_target_stats_file_names {
    my ($tag_ref, $parameters_ref, $files_ref) = @_;

    _set_file_names($files_ref, $parameters_ref, $$tag_ref,
        on_target_file                    => ".on_target.dat",
        on_target_dedup_file              => ".dedup.on_target.dat",
        reads_per_exon_file               => ".reads_per_exon.dat",
        housekeeping_file                 => ".housekeeping.dat",
        reads_per_exon_dedup_file         => ".dedup.reads_per_exon.dat",
        # NOTE: unlike its siblings this suffix puts "dedup" after the stem;
        # kept as-is because the name is part of the on-disk layout.
        housekeeping_dedup_file           => ".housekeeping.dedup.dat",
        reads_per_exon_dedup_file_machine => ".dedup.reads_per_exon.machine.dat",
    );
    return;
}

# Define the total-and-aligned molecule count file names for file 1 (and
# file 2 for paired reads), plus the per-tag name for the sample tag.
sub define_total_molecule_counts_file_names {
    my ($reads_ref, $tags_array_ref, $tag_ref, $parameters_ref, $files_ref) = @_;

    _set_file_names($files_ref, $parameters_ref, $tags_array_ref->[0],
        total_and_aligned_molecule_count_file_1 => ".total_and_aligned_molecule_count.dat",
    );

    if ( $$reads_ref eq 'paired' ) {
        _set_file_names($files_ref, $parameters_ref, $tags_array_ref->[1],
            total_and_aligned_molecule_count_file_2 => ".total_and_aligned_molecule_count.dat",
        );
    }

    _set_file_names($files_ref, $parameters_ref, $$tag_ref,
        total_and_aligned_molecule_count_per_tag_file => ".per_tag.total_and_aligned_molecule_count.dat",
    );
    return;
}

# Define the unique-and-aligned molecule count file names for file 1 (and
# file 2 for paired reads), plus the per-tag name for the sample tag.
sub define_de_deduplicated_molecule_counts_file_names {
    my ($reads_ref, $tags_array_ref, $tag_ref, $parameters_ref, $files_ref) = @_;

    _set_file_names($files_ref, $parameters_ref, $tags_array_ref->[0],
        unique_and_aligned_molecule_count_file_1 => ".unique_and_aligned_molecule_count.dat",
    );

    if ( $$reads_ref eq 'paired' ) {
        _set_file_names($files_ref, $parameters_ref, $tags_array_ref->[1],
            unique_and_aligned_molecule_count_file_2 => ".unique_and_aligned_molecule_count.dat",
        );
    }

    _set_file_names($files_ref, $parameters_ref, $$tag_ref,
        unique_and_aligned_molecule_count_per_tag_file => ".per_tag.unique_and_aligned_molecule_count.dat",
    );
    return;
}

# Define the combined molecule counts file names for the sample tag.
sub define_all_molecule_counts_file_names {
    my ($tag_ref, $parameters_ref, $files_ref) = @_;

    _set_file_names($files_ref, $parameters_ref, $$tag_ref,
        counts_file         => ".counts.dat",
        counts_file_machine => ".counts.machine.dat",
    );
    return;
}

# Define the QC filter file names for the sample tag.
sub define_qc_check_file_names {
    my ($tag_ref, $parameters_ref, $files_ref) = @_;

    _set_file_names($files_ref, $parameters_ref, $$tag_ref,
        qc_filter_file         => ".qc_filter.dat",
        qc_filter_file_machine => ".qc_filter.machine.dat",
    );
    return;
}

# Define the coverage uniformity file names for the sample tag.
sub define_coverage_uniformity_file_names {
    my ($tag_ref, $parameters_ref, $files_ref) = @_;

    _set_file_names($files_ref, $parameters_ref, $$tag_ref,
        coverage_uniformity_file         => ".coverage_uniformity.dat",
        coverage_uniformity_file_machine => ".coverage_uniformity.machine.dat",
    );
    return;
}

# Define the summary file names (dat, ps, pdf and machine-readable dat) for
# the sample tag.
sub define_summary_file_names {
    my ($tag_ref, $parameters_ref, $files_ref) = @_;

    _set_file_names($files_ref, $parameters_ref, $$tag_ref,
        summary_file         => ".summary.dat",
        summary_file_ps      => ".summary.ps",
        summary_file_pdf     => ".summary.pdf",
        summary_file_machine => ".summary.machine.dat",
    );
    return;
}