Mercurial > repos > clifinder > clifinder
view script/CLIFinder.pl @ 20:f25d12179c6c draft
"planemo upload for repository https://github.com/GReD-Clermont/CLIFinder/ commit d5ec4f62fa3d1d52508e07e1221a0c22f0d615bf"
author | clifinder |
---|---|
date | Thu, 12 Mar 2020 18:01:10 -0400 |
parents | 5c16e8ddff78 |
children | 7bef1cd4d2ad |
line wrap: on
line source
#!/usr/bin/env perl ################################################ #Declaration of necessary libraries############# ################################################ use strict; use warnings; use Parallel::ForkManager; use POSIX; use Statistics::R; use Getopt::Long qw(HelpMessage VersionMessage); use File::Basename; use File::Copy::Recursive; use FindBin qw($Bin); use Archive::Tar; our $VERSION = '0.5.1'; ##################################################################### #Definition options of execution according to the previous variables# ##################################################################### GetOptions( "first|1=s" => \my @fastq1, "second|2=s" => \my @fastq2, "name=s" => \my @name, "html=s" => \my $html, "html_path=s" => \my $html_repertory, "TE=s" => \my $TE, "ref=s" => \my $ref, "rnadb:s" => \my $rna_source, "estdb:s" => \my $est_source, "build_TE" => \my $build_TE, "build_ref" => \my $build_ref, "build_rnadb" => \my $build_rnadb, "build_estdb" => \my $build_estdb, "rmsk=s" => \my $rmsk_source, "refseq=s" => \my $refseq, "species=s" => \(my $species = "human"), "min_unique:i" => \(my $prct = 33), "size_insert:i" => \(my $maxInsertSize = 250), "size_read:i" => \(my $size_reads = 100), "BDir:i" => \(my $Bdir = 0), "min_L1:i" => \(my $min_L1 = 50), "mis_L1:i" => \(my $mis_L1 = 1), "threads:i" => \(my $threads = 1), "help" => sub { HelpMessage(0); }, "version" => sub { VersionMessage(0); }, ) or HelpMessage(1); HelpMessage(1) unless @fastq1 && @fastq2 && @name && defined($TE) && defined($ref) && defined($rmsk_source) && defined($refseq) && defined($html) && defined($html_repertory); my $iprct = 100 - (($prct / $size_reads)*100) ; my $mis_auth = $size_reads - $min_L1 + $mis_L1 ; my $eprct = ($iprct * $size_reads) /100; my $dprct = ((100-$iprct) * $size_reads) / 100; ################################################ #Clean up names and species # ################################################ foreach(@name) { $_ =~ s/[^A-Za-z0-9_\-\.]/_/g; } $species =~ s/[;&|]//g; ################################################ #Construct index of ref and TE if doesn't exist# ################################################ `(bwa index '$ref')` if ($build_ref); `(bwa index '$TE')` if ($build_TE); ############################################ #Create repository to store resulting files# ############################################ mkdir $html_repertory; ########################################## #Define hash # ########################################## my %frag_exp_id; ########################## #Data file we have to use# ########################## print STDOUT "Extracting data from rmsk file\n"; my $line_only=$html_repertory.'/'.'line_only.txt'; my $rmsk = $html_repertory.'/rmsk.bed'; filter_convert_rmsk($rmsk_source, $rmsk, $line_only); ############################## # Analyse of each fastq file # ############################## my @garbage; my $num = 0; foreach my $tabR (0..$#fastq1) { ################################################### # Paired end mapping against L1 promoter sequences# ################################################### ## Align reads on L1 but only keep half-mapped pairs print STDOUT "Alignment of $name[$tabR] to L1\n"; my $sam = $html_repertory.'/'.$name[$tabR].'_L1.sam'; push(@garbage, $sam); halfmap_paired($TE, $fastq1[$tabR], $fastq2[$tabR], $sam, $threads, $mis_auth); print STDOUT "Alignment done\n"; ## Filter alignments based on mis_L1 and min_L1 print STDOUT "Filtering alignments based on mis_L1 and min_L1\n"; my $filtered_sam = $html_repertory.'/'.$name[$tabR].'_L1_filtered.sam'; push(@garbage, $filtered_sam); filter_halfmapped($sam, $filtered_sam, $mis_L1, $min_L1); ################################################## # Creation of two fastq for paired halfed mapped:# # - _1 correspond to sequences mapped to L1 # # - _2 correspond to sequences unmapped to L1 # ################################################## # Half-mapped reads my $hm_reads_1 = $html_repertory.'/'.$name[$tabR].'_halfmapped_1.fastq'; push(@garbage, $hm_reads_1); my $hm_reads_2 = $html_repertory.'/'.$name[$tabR].'_halfmapped_2.fastq'; push(@garbage, $hm_reads_2); ## Split mate that matched to L1 and others## my $half_num_out = get_halfmapped_reads($filtered_sam, $Bdir, $hm_reads_1, $hm_reads_2); print STDOUT "Number of half mapped pairs: $half_num_out\n"; ## Get pairs after repeatmasker on the other mate print STDOUT "Getting pairs with one mate matched to L1 and the other mate undetected by repeatmasker as a repeat sequence\n"; # Filtered reads after repeatmasker my $out_ASP_1 = $html_repertory.'/'.$name[$tabR].'_1.fastq'; push(@garbage, $out_ASP_1); my $out_ASP_2 = $html_repertory.'/'.$name[$tabR].'_2.fastq'; push(@garbage, $out_ASP_2); my $left = sort_out($threads, $species, $hm_reads_1, $hm_reads_2, $out_ASP_1, $out_ASP_2, $dprct, $eprct, $html_repertory); print STDOUT "Number of pairs after repeatmasker: $left\n"; ################################################## # Alignment of halfed mapped pairs on genome # ################################################## ## Align filtered reads on genome print STDOUT "Alignment of potential chimeric sequences to the genome\n"; $sam = $html_repertory.'/'.$name[$tabR].'_genome.sam'; push(@garbage, $sam); align_genome($ref, $out_ASP_1, $out_ASP_2, $sam, $maxInsertSize, $threads); print STDOUT "Alignment done\n"; ## Compute the number of sequences obtained after alignment ## $left = `samtools view -@ $threads -Shc '$sam'`; chomp $left; $left = $left/2; print STDOUT "Number of sequences: $left\n"; ################################################## # Create bedfiles of potential chimerae # # and Know repeat sequences removed # ################################################## print STDOUT "Looking for chimerae\n"; results($html_repertory, $sam, $name[$tabR], $iprct, \%frag_exp_id, $rmsk, $num, \@garbage); $num++; } ##define files variables ## my $repfirst = $html_repertory.'/first.bed'; push(@garbage,$repfirst); my $repsecond = $html_repertory.'/second.bed'; push(@garbage,$repsecond); my $repMfirst = $html_repertory.'/firstM.bed'; push(@garbage,$repMfirst); my $repMsecond = $html_repertory.'/secondM.bed'; push(@garbage,$repMsecond); #my $covRepMsecond = $html_repertory.'/covSecondM.bed'; push(@garbage,$covRepMsecond); ##Concatenate all files for first and second mate results ## `cat $html_repertory/*-first.bed > '$repfirst'`; #*/ `cat $html_repertory/*-second.bed > '$repsecond'`; #*/ ## Sort Files and generate files that merge reads in the same locus ## print STDOUT "Sort files and merge reads in the same locus\n"; `bedtools sort -i '$repfirst' | bedtools merge -c 4,5 -o collapse,max -d 100 -s > '$repMfirst'`; `bedtools sort -i '$repsecond' | bedtools merge -c 4,5 -o collapse,max -d 100 -s > '$repMsecond'`; my (%frag_uni, @second_R, @second_exp, @results); my $merge_target = $html_repertory.'/target_merged.bed'; push(@garbage, $merge_target); my $merge = $html_repertory.'/merged.bed'; push(@garbage, $merge); open(my $mT, '>', $merge_target) || die "Cannot open $merge_target\n"; open(my $m, '>', $merge) || die "Cannot open $merge\n"; open(my $in, '<', $repMsecond) || die "Cannot open $repMsecond\n"; my $cmp = 0; while (<$in>) { chomp $_; my @tmp = (0) x scalar(@fastq1); my @line = split /\t/, $_; my @names =split /,/, $line[4]; foreach my $n (@names){$n =~/(.*?)\/[12]/; $frag_uni{$1} = $cmp; $tmp[$frag_exp_id{$1}]++; } $second_exp[$cmp] = \@tmp; $cmp++; push @second_R, [$line[0],$line[1],$line[2],$line[3]]; } $cmp = 0; open($in, '<', $repMfirst) || die "Cannot open $repMfirst\n"; while (<$in>) { chomp $_; my %sec; my @line = split /\t/, $_; my @names =split /,/, $line[4]; my @tmp = (0) x scalar(@fastq1); foreach my $n (@names){$n =~/(.*?)\/[12]/; $tmp[$frag_exp_id{$1}]++; } foreach my $n (@names) { $n =~/(.*?)\/[12]/; unless (exists ($sec{$frag_uni{$1}}) ) { my @lmp = ($line[0], $line[1], $line[2], $line[3]); foreach my $exp_N (@tmp){ push @lmp, $exp_N;} push (@lmp, $second_R[$frag_uni{$1}]->[0], $second_R[$frag_uni{$1}]->[1], $second_R[$frag_uni{$1}]->[2], $second_R[$frag_uni{$1}]->[3]); foreach my $exp_N (@{$second_exp[$frag_uni{$1}]}){ push @lmp, $exp_N;} my $name = $cmp.'-'.$second_R[$frag_uni{$1}]->[0].'-'.$second_R[$frag_uni{$1}]->[1].'-'.$second_R[$frag_uni{$1}]->[2]; print $mT $second_R[$frag_uni{$1}]->[0]."\t".$second_R[$frag_uni{$1}]->[1]."\t".$second_R[$frag_uni{$1}]->[2]."\t$name\t29\t".$second_R[$frag_uni{$1}]->[3]."\n"; my ($b, $e) = (0,0); if ($line[1] < $second_R[$frag_uni{$1}]->[1]) { $b = $line[1] - 1000; $e = $second_R[$frag_uni{$1}]->[2] + 1000; $name = $cmp.'-'.$line[0].'-'.$b.'-'.$e; print $m $line[0]."\t".$b."\t".$e."\t$name\t29\t".$second_R[$frag_uni{$1}]->[3]."\n"; } else { $b = $second_R[$frag_uni{$1}]->[1] - 1000; $e = $line[2] + 1000; $name = $cmp.'-'.$line[0].'-'.$b.'-'.$e; print $m $line[0]."\t".$b."\t".$e."\t$name\t29\t".$second_R[$frag_uni{$1}]->[3]."\n"; } $results[$cmp] = \@lmp; $cmp++; } $sec{$frag_uni{$1}} = undef; } } close $mT; close $m; my $fasta = $html_repertory.'/target_merged.fasta'; push(@garbage, $fasta); my $extend = $html_repertory.'/extend.fasta'; push(@garbage, $extend); `bedtools getfasta -name -fi '$ref' -bed '$merge' -fo '$extend'`; `bedtools getfasta -name -fi '$ref' -bed '$merge_target' -fo '$fasta'`; ################################################ #Blast against human rna and est, if provided # ################################################ my $rna; my $est; if(defined($rna_source)) { ##get databases for est and rna print STDOUT "Getting blast databases for rna\n"; my $rna_db = get_blastdb_from_source($rna_source, $build_rnadb, 'rna', $html_repertory); print STDOUT "Blast against human rna\n"; my $tabular = $html_repertory.'/chimerae_rna.tab'; push(@garbage, $tabular); blast($rna_db, $fasta, $tabular, $threads); $rna = extract_blast($tabular); # Clean RNA blast database if in html dir if(rindex($rna_db, $html_repertory, 0) == 0) { my $toErase = $rna_db.'.*'; unlink glob "$toErase"; } } if(defined($est_source)) { print STDOUT "Getting blast databases for est\n"; my $est_db = get_blastdb_from_source($est_source, $build_estdb, 'est', $html_repertory); print STDOUT "Blast against human est\n"; my $tabular2 = $html_repertory.'/chimerae_est.tab'; push(@garbage, $tabular2); blast($est_db, $fasta, $tabular2, $threads); $est = extract_blast($tabular2); # Clean EST blast database if in html dir if(rindex($est_db, $html_repertory, 0) == 0) { my $toErase = $est_db.'.*'; unlink glob "$toErase"; } } ################################################ #Create Results html file # ################################################ print STDOUT "Save results in file\n"; save_csv(\@fastq1, \@name, \@results, $line_only, $refseq, $html_repertory); print STDOUT "Create HTML\n"; html_tab(\@fastq1, \@name, \@results, $rna, $est, $html, $html_repertory); $extend = $extend.'*'; push(@garbage, glob($extend)); push(@garbage, $line_only); push(@garbage, $rmsk); unlink @garbage; print STDOUT "Job done!\n"; ########################################### END MAIN ########################################################## ############################################################################################################## ############################################ FUNCTIONS ################################################# ############################################################################################################## ############################################################ ## Function to convert rmsk table to bed and line_only ##### ############################################################ ## @param: # ## $source: rmsk table file # ## $bed: rmsk bed file # ## $line_only: rmsk table file with only LINE # ############################################################ sub filter_convert_rmsk { my ($source, $bed, $line_only) = @_; open(my $input, '<', $source) || die "Cannot open rmsk file! $!\n"; ## Open source file open(my $bedfile, '>', $bed) || die "Cannot open output bed file for rmsk! $!\n"; ## Open bed file open(my $linefile, '>', $line_only) || die "Cannot open output LINE-only file for rmsk! $!\n"; ## Open line_only file my @headers; my %indices; print $linefile "#filter: rmsk.repClass = 'LINE'\n"; while(<$input>) { chomp $_; if($. == 1) { if(substr($_, 0, 1) ne "#") { die "rmsk file does not have header starting with #\n"; } else { print $linefile "$_\n"; my $firstline = substr($_, 1); @headers = split(/\t/, $firstline); @indices{@headers} = 0..$#headers; } } else { my @line = split(/\t/,$_); if($line[$indices{"repClass"}] eq "LINE") { print $linefile "$_\n"; } print $bedfile "$line[$indices{'genoName'}]\t$line[$indices{'genoStart'}]\t$line[$indices{'genoEnd'}]\t$line[$indices{'repName'}]\t$line[$indices{'swScore'}]\t$line[$indices{'strand'}]\n"; } } close $input; close $bedfile; close $linefile; } ############################################################ ##Function to align paired-end reads on a genome ######## ############################################################ ## @param: # ## $index: referential genome # ## $fasq1: first paired end file # ## $fasq2: second paired end file # ## $sam: alignment output file # ## $maxInsertSize: maximum size of insert # ## $threads: number of threads used # ############################################################ sub align_genome { my ($index, $fastq1, $fastq2, $sam, $maxInsertSize, $threads) = @_ ; my @L_garbage =(); my $sai1 = $sam.'_temporary.sai1'; push @L_garbage,$sai1; my $sai2 = $sam.'_temporary.sai2'; push @L_garbage,$sai2; `bwa aln -o4 -e1000 -t $threads '$index' '$fastq1' > '$sai1'`; `bwa aln -o4 -e1000 -t $threads '$index' '$fastq2' > '$sai2'`; ## -A force the insertion size `bwa sampe -s -A -a $maxInsertSize '$index' '$sai1' '$sai2' '$fastq1' '$fastq2' | samtools view -@ $threads -F4 -f 2 -Sh /dev/stdin -o '$sam'`; unlink @L_garbage; } ############################################################ ##Function to get half-mapped paired-end reads on a ref # ############################################################ ## @param: # ## $index: referential file # ## $fasq1: first file paired end reads # ## $fasq2: second file paired end reads # ## $sam: output alignment file # ## $threads: number of threads used # ## $mis: tolerated mismatches # ############################################################ sub halfmap_paired { my ($index, $fastq1, $fastq2, $sam, $threads, $mis) = @_ ; my @garbage = (); my $sai1 = $sam.'_temporary.sai1'; push(@garbage,$sai1); my $sai2 = $sam.'_temporary.sai2'; push(@garbage,$sai2); ##alignement with bwa `bwa aln -n $mis -t $threads '$index' '$fastq1' > '$sai1'`; `bwa aln -n $mis -t $threads '$index' '$fastq2' > '$sai2'`; `bwa sampe '$index' '$sai1' '$sai2' '$fastq1' '$fastq2' | samtools view -@ $threads -h -F 2 -G 12 -o '$sam'`; ## delete temporary single aligned files unlink @garbage; } ############################################################ ##Function filter_halfmapped to filter half-mapped pairs ### ############################################################ ## @param: # ## $sam: name of alignment file # ## $filteredsam: name of filtered alignment file # ## $mis_L1: maximum number of mismatches # ## $min_L1: minimum number of bp matching # ############################################################ sub filter_halfmapped { ## store name of file my $sam = shift; my $filtered_sam = shift; my $mis_L1 = shift; my $min_L1 = shift; open(my $in, '<', $sam) || die "Cannot open $sam file! ($!)\n"; open(my $out, '>', $filtered_sam) || die "Cannot open $filtered_sam file! ($!)\n"; ##read file## while(<$in>) { chomp $_; ##Find if alignments have min_L1 consecutives bases mapped on R1 ## if ($_ =~/MD:Z:(.*)/) { my $MD = $1; $MD = $1 if ($MD =~ /(.*)\t/); $MD =~ s/\^//g; my @tab = split /[ATCG]/,$MD; my $tot = 0; my $accept = 0; if( $mis_L1+1 < scalar(@tab) ) { splice(@tab, $mis_L1+1); foreach my $elt (@tab) { $tot += int($elt) if($elt ne ''); } next if $tot < $min_L1; } } print $out "$_\n"; } close $in; close $out; } ############################################################ ##Function get_halfmapped_reads to get half-mapped reads ### ############################################################ ## @param: # ## $sam: name of alignment file # ## $Bdir: reads orientation # ## $fastq1: fastq file with matching reads (1) # ## $fastq2: fastq file with matching reads (2) # ## # ## @return: # ## $half_num_out: number of alignment saved # ############################################################ sub get_halfmapped_reads { my $sam = shift; my $Bdir = shift; my $fastq1 = shift; my $fastq2 = shift; my $half_num_out = 0; # Generate fastq files my $report; if($Bdir == 2) { $report = `samtools sort -n '$sam' -O SAM | samtools view -h -G 72 | samtools fastq -G 132 -1 '$fastq2' -2 '$fastq1' -0 /dev/null -s /dev/null /dev/stdin 2>&1 > /dev/null`; } elsif($Bdir == 1) { $report = `samtools sort -n '$sam' -O SAM | samtools view -h -G 136 | samtools fastq -G 68 -1 '$fastq1' -2 '$fastq2' -0 /dev/null -s /dev/null /dev/stdin 2>&1 > /dev/null`; } else { $report = `samtools fixmate '$sam' /dev/stdout | samtools sort -n -O SAM | samtools fastq -n -f 9 -F 4 /dev/stdin 2>&1 > '$fastq1'`; my $report2 = `samtools fixmate '$sam' /dev/stdout | samtools sort -n -O SAM | samtools fastq -n -f 5 -F 8 /dev/stdin 2>&1 > '$fastq2'`; $report = $report.$report2; } my @processed = $report =~ m/processed\ (\d+)\ reads/; if(defined($processed[0])) { $half_num_out = int($processed[0]); } if(defined($processed[1]) && $processed[1] ne $processed[0]) { print STDERR "Number of half-mapped reads in file _1 and _2 not matching.\n"; $half_num_out = int($processed[$processed[0]>$processed[1]]); # Get min value } print STDERR $report; return $half_num_out; } ############################################################ ##Function sort_out: extract paired end reads ### ############################################################ ## @param: # ## $threads: number of threads used # ## $species: species RepeatMasker should use # ## $in1: input file halfmapped 1 # ## $in2: input file halfmapped 2 # ## $out1: output file accepted 1 # ## $out2: output file accepted 2 # ## $dprct: number of bp not annotated by RepeatMasker# ## $eprct: number of repeated bases tolerated # ## $html_repertory: folder for html files # ############################################################ sub sort_out { my ($threads, $species, $in1, $in2, $out1, $out2, $dprct, $eprct, $html_repertory) = @_; my ($name,$path) = fileparse($out2,'.fastq'); my %repeat; my @garbage = (); my $cmp = 0; my $repout = $html_repertory.'/'.$name.'.repout/'; my $list = $html_repertory.'/'.$name.'.list'; push(@garbage, $list); my $fa = $html_repertory.'/'.$name.'.fa'; push(@garbage, $fa); mkdir $repout; my %notLine; ## Transform fastq file to fasta `fastq_to_fasta -i '$in2' -o '$fa' -Q33`; ##Launch RepeatMasker on fasta file `RepeatMasker -s -pa $threads -dir '$repout' -species "$species" '$fa'`; my $repfile = $repout.$name.'.fa.out'; open(my $rep, '<', $repfile) || die "Cannot open $repfile ($!)\n"; while(<$rep>) { chomp; ## test the percent of repeats ## my $string = $_; $string =~ s/^\s+//; next unless ($string =~ /^\d/); $string =~ s/\s+/ /g; my @line = split (' ',$string); if ( exists($repeat{$line[4]}) ) { $repeat{$line[4]}->[0] = $line[5] if $repeat{$line[4]}->[0] > $line[5]; $repeat{$line[4]}->[1] = $line[6] if $repeat{$line[4]}->[1] < $line[6]; } else{ $repeat{$line[4]} = [$line[5],$line[6]];} } close $rep; ## store in list if pair passed the repeat test ## open(my $fq, '<', $in2) || die "Cannot open $in2 ($!)\n"; open(my $lst, '>', $list) || die "Cannot open $list ($!)\n"; while(<$fq>) { chomp $_; next unless(index($_, '@') == 0 && $. % 4 == 1); my $seqname = substr($_, 1); my $repseq = $repeat{$seqname}; unless(defined($repseq) && ($repseq->[0] <= $dprct && $repseq->[1] >= $eprct)) { print $lst "$seqname\n"; $cmp++; } } close $fq; close $lst; ##write resulting reads in both files for paired ## `seqtk subseq '$in1' '$list' > '$out1'`; `seqtk subseq '$in2' '$list' > '$out2'`; ##drop files and directories generated by repeatmasker## my $toErase = $repout.'*'; unlink glob "$toErase"; unlink @garbage; rmdir $repout; return $cmp; } ############################################################ ##Function results computes bed files for result ### ############################################################ ## @param: # ## $out_repository: repository to store results # ## $file: sam file resulting of alignement # ## $name: name of paireds end reads file # ## $iprct: percentage of repeats tolerated # ## $hashRef: store number for each first read value # ## $rmsk: UCSC repeat sequences # ## $ps: number of the paired end file # ## $garbage_ref: reference to garbage array # ############################################################ sub results { my ($out_repertory, $file, $name, $iprct, $hashRef, $rmsk, $ps, $garbage_ref) = @_; my $tempnamefirst = $out_repertory.'/'.$name.'-first.tmp.bed'; push(@$garbage_ref, $tempnamefirst); my $tempnamesecond = $out_repertory.'/'.$name.'-second.tmp.bed'; push(@$garbage_ref, $tempnamesecond); my $namefirst = $out_repertory.'/'.$name.'-first.bed'; push(@$garbage_ref, $namefirst); my $namesecond = $out_repertory.'/'.$name.'-second.bed'; push(@$garbage_ref, $namesecond); ## store reads mapped in proper pair respectively first and second in pair in bam files and transform in bed files## `samtools view -Sb -f66 '$file' | bedtools bamtobed -i /dev/stdin > '$tempnamefirst'`; `samtools view -Sb -f130 '$file' | bedtools bamtobed -i /dev/stdin > '$tempnamesecond'`; ##compute converage of second mate on rmsk## my $baseCov = 0; my %IdCov = (); my @coverage = `bedtools coverage -a '$tempnamesecond' -b '$rmsk'`; ## store coverage fraction ## foreach my $covRmsk (@coverage) { chomp $covRmsk; my @split_cov = split /\t/, $covRmsk; ##generate identifier for IdCov ## $split_cov[3] =~ /(.*?)\/[12]/; ##store value in IdCov ## if (!exists($IdCov{$1})) { $IdCov{$1} = $split_cov[-1]; } else { $IdCov{$1} = $split_cov[-1] if $split_cov[-1] > $IdCov{$1}; } } ## get only first mate that have less tant $iprct repeats ## open(my $tmp_fi, '<', $tempnamefirst) || die "Cannot open $tempnamefirst!\n"; open(my $nam_fi, '>', $namefirst) || die "Cannot open $namefirst!\n"; while (<$tmp_fi>) { my @line = split /\t/, $_; $line[3] =~ /(.*?)\/[12]/; if ($IdCov{$1} <= $iprct/100) { print $nam_fi $_; ${$hashRef}{$1}= $ps; } } close $tmp_fi; close $nam_fi; ## get only second mate that have less than $iprct repeats ## open(my $tmp_sec, '<', $tempnamesecond) || die "Cannot open $tempnamesecond!\n"; open(my $nam_sec, '>', $namesecond) || die "Cannot open $namesecond!\n"; while (<$tmp_sec>) { my @line = split /\t/, $_; $line[3] =~ /(.*?)\/[12]/; if ($IdCov{$1} <= $iprct/100) { print $nam_sec $_; } } close $tmp_sec; close $nam_sec; } ############################################################ ## Function to get blast db from the specified source ###### ############################################################ ## @param: # ## $source: db source (URL or path) # ## $build_db: whether the db should be created # ## $name: name of the db that could be created # ## $dest_dir: where the data can be placed # ## @return: # ## $path: blast db path # ############################################################ sub get_blastdb_from_source { my ($source, $build_db, $name, $dest_dir) = @_; # Assume source is just db path my $path = $source; my ($file) = $path =~ m~([^/\\]*)$~; my $dbname = $file; my @garbage; if($build_db) { $dbname = $name; $path = $dest_dir.'/'.$name; print STDOUT "Making $dbname blast database\n"; `makeblastdb -in '$source' -dbtype nucl -out '$path'`; } else { # Check if source is URL if(index($source, ":/") != -1) { my $url = $source; if($file =~ /\*/) { $url =~ s/\Q$file//; print STDOUT "Downloading blast database from $url\n"; `wget -q -N -r -nH -nd -np --accept=$file $url -P '$dest_dir'`; # Assume regexp matches db name $dbname =~ s/\*$//; } else { print STDOUT "Downloading blast database from $url\n"; `wget -q -N $source -P '$dest_dir'`; push(@garbage, $dest_dir.'/'.$file); } if($? == 0) { print "Downloaded database\n"; } else { print "Error while downloading database\n"; } $path = $dest_dir.'/'.$dbname; } if(index($file, ".") != -1) { if(index($file, ".tar") != -1) { ## Extract tar files print STDOUT "Extracting blast database from $file\n"; my @properties = ('name'); my $tar=Archive::Tar->new(); $tar->setcwd($dest_dir); $tar->read($path); my @files = $tar->list_files(\@properties); $tar->extract(); $tar->clear(); unlink @garbage; ## Get dbname from filenames my @parts = split(/\./, $files[0]); $dbname = $parts[0]; $path = $dest_dir.'/'.$dbname; print STDOUT "Extracted database\n"; } else { print STDOUT "Unexpected file format for database" } } } print "Using $dbname database\n"; return $path; } ############################################################ ##Function blast: blast nucleotide sequences on ref ### ############################################################ ## @param: # ## $db: database where to search # ## $fasta: file containing nucleotide sequences # ## $tabular: out file name # ## $threads: number of threads used # ############################################################ sub blast { my ($db, $fasta, $tabular, $threads) = @_; `blastn -db '$db' -query '$fasta' -num_threads $threads -out '$tabular' -outfmt 6 -evalue 10e-10`; } ############################################################ ##Function extract_blast: extract result from blast ### ############################################################ ## @param: # ## $file: name of sequences file # ## @return: hash that contains sequences # ############################################################ sub extract_blast { my $file = shift; my %hash = (); open(my $f, '<', $file) || die "Cannot open $file\n"; while (<$f>) { chomp $_; my ($seq,$id) = split /\t/,$_; chomp $id; $seq = $1 if ($seq =~ /(\d+)-(.*?)-(\d+)-(\d+)/); chomp $seq; $hash{$seq} = [] unless exists $hash{$seq}; push @{$hash{$seq}}, $id; } close $f; return \%hash; } ############################################################ ##Function print_header: header of html file ### ############################################################ ## @param: # ############################################################ sub print_header { my $fileR = shift; my $title = shift; print $fileR "<!DOCTYPE html>\n<html>\n<head>\n\t<title>$title</title>\n"; print $fileR "\t<style type=\"text/css\">\n"; print $fileR "\t\tbody { font-family:Arial, Helvetica, Sans-Serif; font-size:0.8em;}\n"; print $fileR "\t\t#report { border-collapse:collapse;}\n"; print $fileR "\t\t#report h4 { margin:0px; padding:0px;}\n"; print $fileR "\t\t#report img { float:right;}\n"; print $fileR "\t\t#report ul { margin:10px 0 10px 40px; padding:0px;}\n"; print $fileR "\t\t#report th { background:#7CB8E2 url(static/images/header_bkg.png) repeat-x scroll center left; color:#fff; padding:7px 15px; text-align:left;}\n"; print $fileR "\t\t#report td { background:#C7DDEE none repeat-x scroll center left; color:#000; padding:7px 15px; }\n"; print $fileR "\t\t#report tr.odd td { background:#fff url(static/images/row_bkg.png) repeat-x scroll center left; cursor:pointer; }\n"; print $fileR "\t\t#report div.arrow { background:transparent url(static/images/arrows.png) no-repeat scroll 0px -16px; width:16px; height:16px; display:block;}\n"; print $fileR "\t\t#report div.up { background-position:0px 0px;}\n"; print $fileR "\t</style>\n"; print $fileR "\t<script src=\"./js/jquery.min.js\" type=\"text/javascript\"></script>\n"; print $fileR "\t<script type=\"text/javascript\">\n"; print $fileR "\t\t\$(document).ready(function(){\n"; print $fileR "\t\t\t\$(\"#report tr:odd\").addClass(\"odd\");\n"; print $fileR "\t\t\t\$(\"#report tr:not(.odd)\").hide();\n"; print $fileR "\t\t\t\$(\"#report tr:first-child\").show();\n"; print $fileR "\t\t\t\$(\"#report tr.odd\").click(function(){\n"; print $fileR "\t\t\t\t\$(this).next(\"tr\").toggle();\n"; print $fileR "\t\t\t\t\$(this).find(\".arrow\").toggleClass(\"up\");\n"; print $fileR "\t\t\t});\n"; print $fileR "\t\t\t//\$(\"#report\").jExpand();\n"; print $fileR "\t\t});\n\t</script>\n"; print $fileR "</head>\n<body>\n\t<table id=\"report\">\n"; } ############################################################ ##Function html_tab: definition of html file ### ############################################################ ## @param: # ## $fastq1_ref: reference to first paired end files # ## $name_ref: reference to names of reads files # ## $results_ref: reference to results files # ## $rna: results for known RNA # ## $est: results for known EST # ## $html: html results file # ## $html_repertory: repository to store results # ############################################################ sub html_tab { my ($fastq1_ref, $name_ref, $results_ref, $rna, $est, $html, $html_repertory) = @_; my $out = $html_repertory; my @fastq1 = @{$fastq1_ref}; my @name = @{$name_ref}; my @results = @{$results_ref}; my ($rna_col, $est_col) = ('',''); $rna_col = "Known RNA" if(defined($rna)); $est_col = "Known EST" if(defined($est)); # Copy HTML resources to results folder File::Copy::Recursive::dircopy "$Bin/js/", "$out/js" or die "Copy failed: $!"; File::Copy::Recursive::dircopy "$Bin/static/", "$out/static" or die "Copy failed: $!"; my $chimOut = $html; open(my $tab, '>', $chimOut) || die "Cannot open $chimOut"; print_header($tab, "Chimerae"); print $tab "\t\t<tr>\n\t\t\t<th>L1 chromosome</th>\n\t\t\t<th>L1 start</th>\n\t\t\t<th>L1 end</th>\n\t\t\t<th>L1 strand</th>\n"; for my $i (0..$#fastq1) { print $tab "\t\t\t<th>$name[$i] read #</th>\n"; } print $tab "\t\t\t<th>Chimera chromosome</th>\n\t\t\t<th>Chimera start</th>\n\t\t\t<th>Chimera end</th>\n\t\t\t<th>Chimera strand</th>\n"; for my $i (0..$#fastq1) { print $tab "\t\t\t<th>$name[$i] read #</th>\n"; } print $tab "\t\t\t<th>$rna_col</th>\n\t\t\t<th>$est_col</th>\n\t\t\t<th></th>\n\t\t</tr>\n"; for my $i (0..$#results) { print $tab "\t\t<tr>\n"; foreach my $j (@{$results[$i]}) { print $tab "\t\t\t<td>$j</td>\n"; } my ($Hrna, $Hest) = ('',''); $Hrna = "<a target=\"_blank\" rel=\"noopener noreferrer\" href=\"https://www.ncbi.nlm.nih.gov/nuccore/${$rna}{$i}[0]\">${$rna}{$i}[0]</a>" if exists(${$rna}{$i}); $Hest = "<a target=\"_blank\" rel=\"noopener noreferrer\" href=\"https://www.ncbi.nlm.nih.gov/nuccore/${$est}{$i}[0]\">${$est}{$i}[0]</a>" if exists(${$est}{$i}); print $tab "\t\t\t<td>$Hrna</td>\n\t\t\t<td>$Hest</td>\n\t\t\t<td><div class=\"arrow\"></div></td>\n\t\t</tr>\n"; my $colspan = scalar(@fastq1) * 2 + 8; print $tab "\t\t<tr>\n\t\t\t<td valign=top colspan=$colspan></td>\n\t\t\t<td valign=top>\n"; if (exists(${$rna}{$i})) { for (my $w = 1; $w <= $#{${$rna}{$i}}; $w++) { $Hrna = "<a target=\"_blank\" rel=\"noopener noreferrer\" href=\"https://www.ncbi.nlm.nih.gov/nuccore/${$rna}{$i}[$w]\">${$rna}{$i}[$w]</a>"; print $tab "\t\t\t\t$Hrna<br>\n"; } delete ${$rna}{$i}; } print $tab "\t\t\t</td>\n\t\t\t<td valign=top>\n"; if (exists (${$est}{$i})) { for (my $w = 1; $w <= $#{${$est}{$i}}; $w++) { $Hest = "<a target=\"_blank\" rel=\"noopener noreferrer\" href=\"https://www.ncbi.nlm.nih.gov/nuccore/${$est}{$i}[$w]\">${$est}{$i}[$w]</a>"; print $tab "\t\t\t\t$Hest<br>\n"; } delete ${$est}{$i}; } print $tab "\t\t\t</td>\n\t\t\t<td></td>\n\t\t</tr>\n"; } print $tab "\t</table>\n</body>\n</html>\n"; close $tab; } ############################################################ ##Function save_csv: save results in different formats ### ############################################################ ## @param: # ## $fastq1_ref: reference to first paired end files # ## $name_ref: reference to names of reads files # ## $results_ref: reference to results files # ## $line_only: Line only database # ## $refseq: refseq text file # ## $out: repository to store results # ############################################################ sub save_csv{ my ($fastq1_ref, $name_ref, $results_ref, $line_only, $refseq, $out) = @_; my @fastq1 = @{$fastq1_ref}; my @name = @{$name_ref}; my @results = @{$results_ref}; my $out1= $out.'/results.txt'; my $out2= $out.'/first_results.txt'; my $out3= $out.'/final_result_chimerae.txt'; # save result in csv file ## my $filed = $out1; open(my $tab, '>', $filed) || die "Cannot open $filed"; print $tab "L1 chromosome\tL1 start\tL1 end\tL1 strand";; for my $i (0..$#fastq1) { print $tab "\t$name[$i] read #"; } print $tab "\tChimera chromosome\tChimera start\tChimera end\tChimera strand"; for my $i (0..$#fastq1) { print $tab "\t$name[$i] read #"; } print $tab "\n"; for my $i ( 0 .. $#results ) { my $rowref = $results[$i]; my $n = @$rowref - 1; for my $j ( 0 .. $n-1 ) { print $tab "$results[$i][$j]\t"; } print $tab "$results[$i][$n]\n"; } close $tab; ##Add some information via R Scripts## # Create bridge between Perl and R my $R = Statistics::R->new(); $R->start(); $R->set('out1', $out1); $R->set('out2', $out2); $R->set('out3', $out3); $R->set('nfastq', $#fastq1); $R->set('line_only', $line_only); $R->set('refseq', $refseq); my $R_out = $R->run_from_file("$Bin/CLIFinder_results.R"); $R->stop(); print STDOUT "$R_out\n"; } __END__ =head1 NAME CLIFinder - Identification of L1 Chimeric Transcripts in RNA-seq data =head1 SYNOPSIS CLIFinder.pl --first <first fastq of paired-end set 1> --name <name 1> --second <second fastq of paired-end set 1> [--first <first fastq of paired-end set 2> --name <name 2> --second <second fastq of paired-end set 2> ...] --ref <reference genome> [--build_ref] --TE <transposable elements> [--build_TE] --html <results.html> --html-path <results directory> [options] Arguments: --first <fastq file> First fastq file to process from paired-end set --name <name> Name of the content to process --second <fastq file> Second fastq file to process from paired-end set --ref <reference> Fasta file containing the reference genome --TE <TE> Fasta file containing the transposable elements --rmsk <text file> Tab-delimited text file (with headers) containing reference repeat sequences (e.g. rmsk track from UCSC) --refseq <text file> Tab-delimited file (with headers) containing reference genes (e.g. RefGene.txt from UCSC) --html <html file> Main HTML file where results will be displayed --html_path <dir> Folder where results will be stored For any fasta file, if a bwa index is not provided, you should build it through the corresponding '--build_[element]' argument Options: --rnadb <RNA db> Blast database containing RNA sequences (default: empty) --estdb <EST db> Blast database containing EST sequences (default: empty) --size_read <INT> Size of reads (default: 100) --BDir <0|1|2> Orientation of reads (0: undirectional libraries, 1: TEs sequences in first read in pair, 2: TEs sequences in second read in pair) (default: 0) --size_insert <INT> Maximum size of insert tolerated between R1 and R2 for alignment on the reference genome (default: 250) --min_L1 <INT> Minimum number of bp matching for L1 mapping (default: 50) --mis_L1 <INT> Maximum number of mismatches tolerated for L1 mapping (default: 1) --min_unique <INT> Minimum number of consecutive bp not annotated by RepeatMasker (default: 33) --species <STRING> Species to use in RepeatMasker (default: human) --threads <INT> Number of threads (default: 1) For Blast database files, if a fasta is provided, the database can be built with '--build_[db]'. Otherwise, provide a path or URL. \"tar(.gz)\" files are acceptable, as well as wild card (rna*) URLs.