#!/usr/bin/perl
# CLIFinder: detect chimeric transcripts between L1 retrotransposon promoters
# and unique genomic sequence, from paired-end reads.
#
# Pipeline: map pairs to L1 (bwa) -> keep half-mapped pairs whose free mate is
# not a known repeat (RepeatMasker) -> map those pairs to the genome -> merge
# loci (bedtools) -> annotate with BLAST vs human RNA/EST -> write CSV + HTML.

################################################
# Libraries
################################################

use strict;
use warnings;
use Sys::CPU;
use Parallel::ForkManager;
use POSIX;
use Statistics::R;
use Getopt::Long;
use File::Basename;

################################################
# Global variables and command-line options
################################################

my (@fastq1, @fastq2, @name, $html, $size_reads, $ref, $TE, $build_index, $build_TE,
    $html_repertory, $maxInsertSize, $prct, $help, $image, $Bdir, $minL1, $mis_L1);

GetOptions(
    "first:s"       => \@fastq1,          # first-mate fastq files (one per sample)
    "second:s"      => \@fastq2,          # second-mate fastq files
    "name:s"        => \@name,            # sample names, parallel to the fastq lists
    "html:s"        => \$html,            # output HTML file
    "TE:s"          => \$TE,              # L1 / transposable-element fasta
    "ref:s"         => \$ref,             # reference genome fasta
    "build_TE"      => \$build_TE,        # build the bwa index for $TE
    "build_index"   => \$build_index,     # build the bwa index for $ref
    "pourcentage:i" => \$prct,            # max fraction of a read allowed to be repeat (in bases%)
    "size_insert:i" => \$maxInsertSize,   # max insert size passed to bwa sampe -a
    "size_read:i"   => \$size_reads,      # read length
    "html_path:s"   => \$html_repertory,  # output directory
    "image:s"       => \$image,
    "BDir:i"        => \$Bdir,            # 0: both mates, 1: only R1 on L1, 2: only R2 on L1
    "minL1:i"       => \$minL1,           # min consecutive bases mapped on L1
    "mis_L1:i"      => \$mis_L1,          # tolerated mismatches on L1
);

# Derived thresholds, all expressed in bases of one read.
my $iprct    = 100 - (($prct / $size_reads) * 100);  # min % of non-repeat bases
my $mis_auth = $size_reads - $minL1 + $mis_L1;       # mismatches allowed by bwa on L1
my $eprct    = ($iprct * $size_reads) / 100;
my $dprct    = ((100 - $iprct) * $size_reads) / 100;

################################################
# Build bwa indexes for genome / TE if requested
################################################

`(bwa index $ref) 2> /dev/null ` if ($build_index);
`(bwa index $TE) 2> /dev/null ` if ($build_TE);

############################################
# Output directory
############################################

mkdir $html_repertory;

######################################################
# Number of CPUs we may use = total / 5, at least 1
######################################################

# BUGFIX: int(total/5) is 0 on machines with < 5 CPUs, which would produce
# "bwa aln -t 0"; clamp to a minimum of one thread.
my $cpu = int(Sys::CPU::cpu_count() / 5) || 1;

##########################################
# read-name -> sample-index map, filled by results()
##########################################

my %frag_exp_id;

##########################
# Data files downloaded later into the output directory
##########################

my $NCBI_est = $html_repertory.'/est_human'; # NCBI human EST blast db
my $NCBI_rna = $html_repertory.'/rna';       # NCBI human RNA blast db
my $rmsk     = $html_repertory.'/rmsk.bed';  # UCSC repeat annotation

####################################
# Redirect standard error to the report file
####################################

my $file = $html_repertory.'/report.txt';
open STDERR, '>', $file or die "Can't redirect STDERR: $!";

##############################
# Analysis of each fastq pair
##############################

my @garbage;
my $num = 0;

foreach my $tabR (0 .. $#fastq1)
{
    ## Paired-end mapping against L1 promoter sequences ##
    print STDERR "alignement of $name[$tabR] to L1\n";
    my $sam = $html_repertory.'/'.$name[$tabR]."_L1.sam"; push(@garbage, $sam);
    align_paired($TE, $fastq1[$tabR], $fastq2[$tabR], $sam, $cpu, $mis_auth);
    print STDERR "alignement done\n";

    ## Two fastqs for half-mapped pairs: _1 = mate mapped to L1, _2 = free mate ##
    print STDERR "getting pairs with one mate matched to L1 and the other mate undetected by repeatmasker as a repeat sequence\n";
    my $out_ASP_1 = $html_repertory.'/'.$name[$tabR]."_1.fastq"; push(@garbage, $out_ASP_1);
    my $out_ASP_2 = $html_repertory.'/'.$name[$tabR]."_2.fastq"; push(@garbage, $out_ASP_2);

    ## split mates that matched L1 from the others ##
    my ($ASP_readsHashR, $half_num_out) = get_half($sam);

    ## keep only pairs whose free mate passes the RepeatMasker filter ##
    my $left = sort_out($cpu, $out_ASP_1, $out_ASP_2, $ASP_readsHashR);
    print STDERR "number of half mapped pairs : $half_num_out\n";
    print STDERR "number of pairs after repeatmasker: $left\n";

    ## Align the retained pairs on the genome ##
    print STDERR "alignement of potential chimeric sequences to the genome\n";
    $sam = $html_repertory.'/'.$name[$tabR]."_genome.sam";
    push(@garbage, $sam);
    align_genome($ref, $out_ASP_1, $out_ASP_2, $sam, $cpu);
    print STDERR "alignement done\n";

    ## count sequences kept after alignment (2 SAM records per pair) ##
    $left = `samtools view -Shc $sam 2> /dev/null`;
    chomp $left; $left = $left / 2;
    print STDERR "number of sequences....: $left\n";

    ## Bed files of potential chimerae with known repeats removed ##
    print STDERR "looking for chimerae\n";
    results($html_repertory, $sam, $name[$tabR], \%frag_exp_id, $num);
    $num++;
}

## Merged-output file names ##

my $repfirst   = $html_repertory.'/first.bed';   push(@garbage, $repfirst);
my $repsecond  = $html_repertory.'/second.bed';  push(@garbage, $repsecond);
my $repMfirst  = $html_repertory.'/firstM.bed';  push(@garbage, $repMfirst);
my $repMsecond = $html_repertory.'/secondM.bed'; push(@garbage, $repMsecond);

## Concatenate all per-sample first- and second-mate bed files ##

`cat $html_repertory/*-first.bed > $repfirst`;
`cat $html_repertory/*-second.bed > $repsecond`;

## Sort, then merge reads falling in the same locus (strand-aware, 100 bp gap) ##
`bedtools sort -i $repfirst 2> /dev/null | bedtools merge -scores max -d 100 -s -nms -i /dev/stdin > $repMfirst 2> /dev/null`;
`bedtools sort -i $repsecond 2> /dev/null | bedtools merge -scores max -d 100 -s -nms -i /dev/stdin > $repMsecond 2> /dev/null`;

my (%frag_uni, @second_R, @second_exp, @results);
my $merge_target = $html_repertory.'/target_merged.bed'; push(@garbage, $merge_target);
my $merge        = $html_repertory.'/merged.bed';        push(@garbage, $merge);

open(my $mT, '>', $merge_target) || die "cannot open $merge_target\n";
open(my $m,  '>', $merge)        || die "cannot open $merge\n";

## Index merged second-mate (chimera side) loci by read name ##
open(my $in, '<', $repMsecond) || die "cannot open secondM\n";
my $cmp = 0;
while (<$in>)
{
    chomp $_;
    my @tmp   = (0) x scalar(@fastq1);  # per-sample read counts for this locus
    my @line  = split /\t/, $_;
    my @names = split /;/, $line[3];
    foreach my $n (@names) { $n =~ /(.*?)\/[12]/; $frag_uni{$1} = $cmp; $tmp[$frag_exp_id{$1}]++; }
    $second_exp[$cmp] = \@tmp;
    $cmp++;
    push @second_R, [$line[0], $line[1], $line[2], $line[5]];
}

## Pair each merged first-mate (L1 side) locus with its second-mate locus ##
$cmp = 0;
open($in, '<', $repMfirst) || die "cannot open firstM\n";
while (<$in>)
{
    chomp $_;
    my %sec;                            # second-side loci already emitted for this line
    my @line  = split /\t/, $_;
    my @names = split /;/, $line[3];
    my @tmp   = (0) x scalar(@fastq1);
    foreach my $n (@names) { $n =~ /(.*?)\/[12]/; $tmp[$frag_exp_id{$1}]++; }
    foreach my $n (@names)
    {
        $n =~ /(.*?)\/[12]/;
        my $sid = $frag_uni{$1};
        unless (exists $sec{$sid})
        {
            my $sR  = $second_R[$sid];
            my @lmp = ($line[0], $line[1], $line[2], $line[5]);
            push @lmp, @tmp;                                  # L1-side counts per sample
            push @lmp, $sR->[0], $sR->[1], $sR->[2], $sR->[3];
            push @lmp, @{ $second_exp[$sid] };                # chimera-side counts

            my $name = $cmp.'-'.$sR->[0].'-'.$sR->[1].'-'.$sR->[2];
            print $mT $sR->[0]."\t".$sR->[1]."\t".$sR->[2]."\t$name\t29\t".$sR->[3]."\n";

            ## window of +/- 1 kb around the whole chimera, for getfasta ##
            my ($b, $e) = (0, 0);
            if ($line[1] < $sR->[1]) { $b = $line[1] - 1000; $e = $sR->[2]  + 1000; }
            else                     { $b = $sR->[1] - 1000; $e = $line[2] + 1000; }
            $name = $cmp.'-'.$line[0].'-'.$b.'-'.$e;
            print $m $line[0]."\t".$b."\t".$e."\t$name\t29\t".$sR->[3]."\n";

            $results[$cmp] = \@lmp;
            $cmp++;
        }
        $sec{$sid} = undef;
    }
}
close $mT; close $m;

my $fasta  = $html_repertory.'/target_merged.fasta'; push(@garbage, $fasta);
my $extend = $html_repertory.'/extend.fasta';        push(@garbage, $extend);

`bedtools getfasta -name -fi $ref -bed $merge -fo $extend`;
`bedtools getfasta -name -fi $ref -bed $merge_target -fo $fasta`;

################################################
# Blast against human RNA and EST
################################################

## fetch blast databases ##
`wget -r -nH -nd -np --accept=est* https://galaxy.gred-clermont.fr/clifinder/ -P $html_repertory `;
`wget -r -nH -nd -np --accept=rna* https://galaxy.gred-clermont.fr/clifinder/ -P $html_repertory `;

print STDERR "blast against human rna\n";
my $tabular = $html_repertory."/chimerae_rna.tab"; push(@garbage, $tabular);
blast($NCBI_rna, $fasta, $tabular, $cpu);
my $rna = extract_blast($tabular);

print STDERR "blast against human est\n";
my $tabular2 = $html_repertory."/chimerae_est.tab"; push(@garbage, $tabular2);
# BUGFIX: the EST blast used to write to and read back from $tabular (the RNA
# results file), so the "Known EST" output silently duplicated the RNA hits.
blast($NCBI_est, $fasta, $tabular2, $cpu);
my $est = extract_blast($tabular2);

################################################
# Results: CSV + HTML
################################################
print STDERR "save result in file\n";
save_csv();
print STDERR "create HTML\n";
html_tab($rna, $est, $html_repertory);

## cleanup ##
$extend = $extend.'*';
push(@garbage, glob($extend));
unlink @garbage;
# BUGFIX: these globs used '\' instead of '/', so the rna*/est* blast
# databases were never actually removed from the output directory.
my $toErase = $html_repertory.'/rna*';
unlink glob "$toErase";
$toErase = $html_repertory.'/est*';
unlink glob "$toErase";

print STDERR "Job done!\n";

########################################### END MAIN ##########################################################

############################################################
## align_genome: paired-end alignment on the genome        #
## @param:                                                 #
##   $index: genome fasta (bwa-indexed)                    #
##   $fastq1, $fastq2: mate files                          #
##   $sam: output SAM (mapped, properly paired reads only) #
##   $number_of_cpus: bwa threads                          #
## Uses the global $maxInsertSize for "bwa sampe -a".      #
############################################################

sub align_genome
{
    my ($index, $fastq1, $fastq2, $sam, $number_of_cpus) = @_;
    my @L_garbage = ();
    my $sai1 = $sam."_temporary.sai1"; push @L_garbage, $sai1;
    my $sai2 = $sam."_temporary.sai2"; push @L_garbage, $sai2;
    `bwa aln -o4 -e1000 -t $number_of_cpus $index $fastq1 > $sai1 2> /dev/null`;
    `bwa aln -o4 -e1000 -t $number_of_cpus $index $fastq2 > $sai2 2> /dev/null`;
    ## -A forces the insert size; keep only mapped (-F4), properly paired (-f2) reads
    `bwa sampe -s -A -a $maxInsertSize $index $sai1 $sai2 $fastq1 $fastq2 2> /dev/null | samtools view -F4 -f 2 -Sh /dev/stdin > $sam`;
    unlink @L_garbage;
}
############################################################
## get_half: collect pairs with exactly one mate on L1     #
## @param:                                                 #
##   $sam: name of the L1 alignment file                   #
## @return:                                                #
##   (\%ASP_reads, $count) where each hash entry is        #
##   [fastq record of the L1-mapped mate,                  #
##    fastq record of the free (unmapped) mate]            #
## Uses globals $mis_L1, $minL1, $Bdir.                    #
############################################################

sub get_half
{
    my $sam = shift;
    open(my $fic, '<', $sam) || die "cannot open sam file! $!\n";
    my %ASP_reads;
    my $cmp = 0;
    my $sequence = '';

    while (<$fic>)
    {
        chomp $_;
        ## skip SAM header lines ##
        next if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/);
        my @line = split(/\t/, $_);

        ## require at least $minL1 consecutive mapped bases, judged from the
        ## NM (mismatch count) and MD (mismatch positions) optional tags ##
        if ($_ =~ /NM:i:(\d+)\t.*MD:Z:(.*)/)
        {
            my $misT = $1; my $MD = $2; my $maxT = 0;
            $MD = $1 if ($MD =~ /(.*)\t/);   # MD may not be the last tag
            $MD =~ s/\^//g;                  # drop deletion markers
            my @tab = split /[ATCG]/, $MD;   # runs of matches between mismatches
            my $tot = 0;
            my $accept = 0;
            if ($misT <= $mis_L1) { $accept = 1; }
            else
            {
                $maxT = ($mis_L1 > scalar(@tab)) ? scalar(@tab) : $mis_L1;
                for (my $t = 0; $t < $maxT; $t++)
                {
                    $tot += $tab[$t] if $tab[$t] ne '';
                }
                $accept = 1 if $tot >= $minL1;
            }
            ## sequence rejected: go to the next record ##
            next if $accept == 0;
        }

        ## keep only flag values describing "exactly one mate mapped" ##
        if ($line[1] == 73 || $line[1] == 89 || $line[1] == 117 || $line[1] == 69
         || $line[1] == 133 || $line[1] == 181 || $line[1] == 153 || $line[1] == 137)
        {
            ## BUGFIX: the Bdir==2 test used '=' (assignment) instead of '==',
            ## which clobbered $Bdir and made the condition always true.
            if ($Bdir == 0 || ($Bdir == 1 && $line[1] & 64) || ($Bdir == 2 && $line[1] & 128))
            {
                $cmp++;
                $sequence = $line[9];
                ## reverse-complement reads aligned on the minus strand ##
                if ($line[1] & 16)
                {
                    $sequence = reverse($sequence);
                    $sequence =~ tr/atgcuATGCU/tacgaTACGA/;
                }
                $ASP_reads{$line[0]} = [undef, undef] unless exists($ASP_reads{$line[0]});

                ## slot 0: this mate mapped on L1 (its mate is unmapped, flag 0x8);
                ## slot 1: the free mate ##
                if ($line[1] & 8)
                {
                    $ASP_reads{$line[0]}[0] = "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n";
                }
                else
                {
                    $ASP_reads{$line[0]}[1] = "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n";
                }
            }
        }
    }
    close $fic;
    return (\%ASP_reads, $cmp);
}

############################################################
## sort_out: RepeatMasker filter on the free mate          #
## @param:                                                 #
##   $cpus: number of CPUs for RepeatMasker                #
##   $out1, $out2: accepted-pair fastq outputs             #
##   $readsHashTabR: pairs from get_half()                 #
## @return: number of pairs kept                           #
## Uses globals $html_repertory, $dprct, $eprct.           #
############################################################

sub sort_out
{
    my ($cpus, $out1, $out2, $readsHashTabR) = @_;
    my ($name, $path) = fileparse($out2, '.fastq');
    my %repeat;
    my @garbage = (); my $cmp = 0;
    my $repout = $html_repertory.'/'.$name."_repout/";
    my $fa     = $html_repertory.'/'.$name.".fa";              push(@garbage, $fa);
    my $second = $html_repertory.'/'.$name."_temporary.fastq"; push(@garbage, $second);
    mkdir $repout;
    my %notLine;   # read names rejected by the repeat test

    ## dump the free mates into a fastq, then convert to fasta ##
    open(my $tmp, '>', $second) || die "cannot open temp file $second\n";
    while (my ($k, $v) = each %{$readsHashTabR})
    {
        print $tmp ${$v}[1] if defined(${$v}[1]);
    }
    close $tmp;
    `fastq_to_fasta -i $second -o $fa -Q33`;

    ## RepeatMasker on the fasta ##
    `RepeatMasker -s -pa $cpus -dir $repout -species human $fa`;
    my $repfile = $repout.$name.".fa.out";
    open(my $rep, '<', $repfile) || die "cannot open $repfile $!\n";
    while (<$rep>)
    {
        chomp;
        my $string = $_;
        $string =~ s/^\s+//;
        next unless ($string =~ /^\d/);   # skip RepeatMasker header lines
        $string =~ s/\s+/ /g;
        my @line = split(' ', $string);
        ## track the widest repeat span (min start, max end) per read ##
        if (exists($repeat{$line[4]}))
        {
            $repeat{$line[4]}->[0] = $line[5] if $repeat{$line[4]}->[0] > $line[5];
            $repeat{$line[4]}->[1] = $line[6] if $repeat{$line[4]}->[1] < $line[6];
        }
        else { $repeat{$line[4]} = [$line[5], $line[6]]; }
    }
    close $rep;

    ## flag reads whose repeat span covers the acceptance window ##
    while (my ($k, $v) = each %repeat)
    {
        $notLine{$k} = 1 unless ($v->[0] > $dprct || $v->[1] < $eprct);
    }

    ## write surviving complete pairs into both fastq outputs ##
    open(my $accepted_1, '>', $out1) || die "cannot open $out1 file $!\n";
    open(my $accepted_2, '>', $out2) || die "cannot open $out2 file $!\n";
    while (my ($k, $v) = each %{$readsHashTabR})
    {
        if (defined(${$v}[0]) && defined(${$v}[1]))
        {
            unless (defined($notLine{$k}) && $notLine{$k} == 1)
            {
                $cmp++;
                print $accepted_1 ${$v}[0];
                print $accepted_2 ${$v}[1];
            }
        }
    }
    close $accepted_1; close $accepted_2;

    ## drop files and directories generated by RepeatMasker ##
    my $toErase = $repout.'*';
    unlink glob "$toErase";
    unlink @garbage; rmdir $repout;
    return $cmp;
}

############################################################
## align_paired: paired-end alignment on a reference       #
## @param:                                                 #
##   $index: reference fasta (bwa-indexed)                 #
##   $fastq1, $fastq2: mate files                          #
##   $sam: output alignment file                           #
##   $number_of_cpus: bwa threads                          #
##   $mis: tolerated mismatches (bwa aln -n)               #
############################################################

sub align_paired
{
    my ($index, $fastq1, $fastq2, $sam, $number_of_cpus, $mis) = @_;
    my @garbage = ();
    my $sai1 = $sam."_temporary.sai1"; push @garbage, $sai1;
    my $sai2 = $sam."_temporary.sai2"; push @garbage, $sai2;
    `bwa aln -n $mis -t $number_of_cpus $index $fastq1 > $sai1 2> /dev/null`;
    `bwa aln -n $mis -t $number_of_cpus $index $fastq2 > $sai2 2> /dev/null`;
    `bwa sampe $index $sai1 $sai2 $fastq1 $fastq2 > $sam 2> /dev/null`;
    ## delete temporary single-end .sai files
    unlink @garbage;
}

############################################################
## align: single-end alignment on a reference              #
## @param:                                                 #
##   $index: reference fasta (bwa-indexed)                 #
##   $fastq: reads file                                    #
##   $sam: output alignment file                           #
##   $number_of_cpus: bwa threads                          #
############################################################

sub align
{
    my ($index, $fastq, $sam, $number_of_cpus) = @_;
    `bwa aln -o4 -e$maxInsertSize -t $number_of_cpus $index $fastq 2> /dev/null | bwa samse $index /dev/stdin $fastq > $sam 2> /dev/null`;
}

############################################################
## results: bed files of candidate chimerae per sample     #
## @param:                                                 #
##   $out_repertory: output directory                      #
##   $file: genome SAM file                                #
##   $name: sample name                                    #
##   $hashRef: read-name -> sample-index map (filled here) #
##   $ps: index of this sample                             #
## Uses globals $rmsk, $iprct, @garbage.                   #
############################################################

sub results
{
    my ($out_repertory, $file, $name, $hashRef, $ps) = @_;
    my $namefirst  = $out_repertory.'/'.$name.'-first.bed';  push(@garbage, $namefirst);
    my $namesecond = $out_repertory.'/'.$name.'-second.bed'; push(@garbage, $namesecond);

    ## fetch the repeat annotation used for the coverage filter ##
    ## BUGFIX: the wget used $out_repository, an undeclared variable
    ## (typo for the $out_repertory parameter), which fails under strict.
    `wget https://galaxy.gred-clermont.fr/clifinder/rmsk.bed -P $out_repertory `;
    push(@garbage, $rmsk);

    ## proper pairs, first / second in pair, converted to bed ##
    `samtools view -Sb -f66 $file 2> /dev/null | bedtools bamtobed -i /dev/stdin > temp_name_first 2> /dev/null`;
    `samtools view -Sb -f130 $file 2> /dev/null | bedtools bamtobed -i /dev/stdin > temp_name_second 2> /dev/null`;

    ## coverage of second mates over the repeat annotation ##
    my %IdCov = ();
    my @coverage = `bedtools coverage -b temp_name_second -a $rmsk`;
    foreach my $covRmsk (@coverage)
    {
        chomp $covRmsk;
        my @split_cov = split /\t/, $covRmsk;
        ## identifier = read name without the /1 or /2 suffix ##
        $split_cov[3] =~ /(.*?)\/[12]/;
        ## keep the highest covered fraction seen for this pair ##
        ## BUGFIX: both IdCov accesses below were missing the '$' sigil,
        ## a compile error under strict.
        if (!exists($IdCov{$1}))
        {
            $IdCov{$1} = $split_cov[-1];
        }
        else
        {
            $IdCov{$1} = $split_cov[-1] if $split_cov[-1] > $IdCov{$1};
        }
    }

    ## keep first mates whose pair has at most $iprct% repeat coverage ##
    open(my $tmp_fi, '<', 'temp_name_first') || die "cannot open $namefirst!\n";
    open(my $nam_fi, '>', $namefirst)        || die "cannot open $namefirst!\n";
    while (<$tmp_fi>)
    {
        my @line = split /\t/, $_;
        $line[3] =~ /(.*?)\/[12]/;
        ## pairs absent from the coverage report count as 0 (no repeat overlap)
        my $cov = defined $IdCov{$1} ? $IdCov{$1} : 0;
        if ($cov <= $iprct / 100)
        {
            print $nam_fi $_;
            ${$hashRef}{$1} = $ps;
        }
    }
    close $tmp_fi; close $nam_fi;

    ## same filter for second mates ##
    open(my $tmp_sec, '<', 'temp_name_second') || die "cannot open $namesecond!\n";
    open(my $nam_sec, '>', $namesecond)        || die "cannot open $namesecond!\n";
    while (<$tmp_sec>)
    {
        my @line = split /\t/, $_;
        $line[3] =~ /(.*?)\/[12]/;
        my $cov = defined $IdCov{$1} ? $IdCov{$1} : 0;
        if ($cov <= $iprct / 100)
        {
            print $nam_sec $_;
        }
    }
    close $tmp_sec; close $nam_sec;
}
############################################################
## blast: nucleotide blast of a fasta against a database   #
## @param:                                                 #
##   $db: blast database to search                         #
##   $query: fasta file with the query sequences           #
##   $out_tab: output file (tabular, -outfmt 6)            #
##   $threads: number of CPUs for blastn                   #
############################################################

sub blast
{
    my ($db, $query, $out_tab, $threads) = @_;
    `blastn -db $db -query $query -num_threads $threads -out $out_tab -outfmt 6 -evalue 10e-10`;
}

############################################################
## extract_blast: group blast hits by query locus index    #
## @param:                                                 #
##   $path: tabular blast result file (-outfmt 6)          #
## @return: hashref { locus index => [ subject ids ] }     #
############################################################

sub extract_blast
{
    my ($path) = @_;
    my %hits_for;
    open(my $fh, '<', $path) || die "cannot open $path\n";
    while (my $row = <$fh>)
    {
        chomp $row;
        my ($query, $subject) = split /\t/, $row;
        ## query names look like "<index>-<chr>-<start>-<end>"; key on the index
        $query = $1 if $query =~ /(\d+)-(.*?)-(\d+)-(\d+)/;
        push @{ $hits_for{$query} }, $subject;
    }
    close $fh;
    return \%hits_for;
}

############################################################
## print_header: emit the HTML page header                 #
## @param:                                                 #
##   $fileR: output filehandle                             #
##   $title: page title                                    #
############################################################

sub print_header
{
    my ($fileR, $title) = @_;
    ## same byte stream as the original sequence of prints, emitted at once
    print {$fileR} " $title\n \n \n";
}

############################################################
## html_tab (below): definition of the HTML result file    #
############################################################
############################################################
## html_tab: write the HTML result table                   #
## @param:                                                 #
##   $rna: hashref of known-RNA blast hits per locus       #
##   $est: hashref of known-EST blast hits per locus       #
##   $out_dir: optional output directory (defaults to the  #
##             global $html_repertory; the caller already  #
##             passes it as a third argument)              #
## NOTE(review): the original HTML tag text was lost when  #
## this file was mangled; the markup below is a plausible  #
## reconstruction from the surviving column labels and     #
## should be checked against upstream CLIFinder sources.   #
############################################################

sub html_tab
{
    my ($rna, $est, $out_dir) = @_;
    # BUGFIX: the sub was called with three arguments but unpacked only two;
    # accept the output directory explicitly (backward-compatible default).
    my $out = defined($out_dir) ? $out_dir : $html_repertory;

    ## static assets for the page ##
    `wget https://galaxy.gred-clermont.fr/clifinder/arrows.png -P $out && wget https://galaxy.gred-clermont.fr/clifinder/row_bkg.png -P $out && wget https://galaxy.gred-clermont.fr/clifinder/jquery.min.js -P $out`;

    my $chimOut = $html;
    open(my $tab, '>', $chimOut) || die "cannot open $chimOut";
    print_header($tab, "Chimerae");

    ## header row: L1 side, per-sample counts, chimera side, counts, annotations ##
    print $tab "<table id=\"result\">\n<thead>\n<tr>\n";
    print $tab "\t<th>L1 chromosome</th>\n\t<th>L1 start</th>\n\t<th>L1 end</th>\n\t<th>L1 strand</th>\n";
    for my $i (0 .. $#fastq1)
    {
        print $tab "\t<th>$name[$i] read #</th>\n";
    }
    print $tab "\t<th>Chimera chromosome</th>\n\t<th>Chimera start</th>\n\t<th>Chimera end</th>\n\t<th>Chimera strand</th>\n";
    for my $i (0 .. $#fastq1)
    {
        print $tab "\t<th>$name[$i] read #</th>\n";
    }
    print $tab "\t<th>Known RNA</th>\n\t<th>Known EST</th>\n</tr>\n</thead>\n<tbody>\n";

    for my $i (0 .. $#results)
    {
        print $tab "<tr>\n";
        foreach my $j (@{$results[$i]})
        {
            print $tab "\t<td>$j</td>\n";
        }

        ## first RNA / EST hit for this locus; accession taken from the
        ## classic "gi|<gi>|<db>|<accession>|" subject-id format ##
        my ($Hrna, $Hest) = ('', '');
        $Hrna = ${$rna}{$i}[0] if exists(${$rna}{$i});
        $Hest = ${$est}{$i}[0] if exists(${$est}{$i});
        my $Lrna = 'link break'; my $Lest = 'link break';
        chomp $Hrna; chomp $Hest;
        $Lrna = $3 if $Hrna =~ /gi\|(.*?)\|(.*?)\|(.*)\|$/;
        $Lest = $3 if $Hest =~ /gi\|(.*?)\|(.*?)\|(.*)\|$/;

        ## RNA column: first hit plus any additional hits ##
        print $tab "\t<td><a href=\"https://www.ncbi.nlm.nih.gov/nuccore/$Lrna\">$Hrna</a>";
        if (exists(${$rna}{$i}))
        {
            for (my $w = 1; $w <= $#{${$rna}{$i}}; $w++)
            {
                $Hrna = ${$rna}{$i}[$w];
                chomp $Hrna;
                $Lrna = 'link break';
                $Lrna = $3 if $Hrna =~ /gi\|(.*?)\|(.*?)\|(.*)\|$/;
                print $tab "<br /><a href=\"https://www.ncbi.nlm.nih.gov/nuccore/$Lrna\">$Hrna</a>\n";
            }
            delete ${$rna}{$i};   # consumed: each locus is rendered once
        }
        print $tab "</td>\n";

        ## EST column ##
        print $tab "\t<td><a href=\"https://www.ncbi.nlm.nih.gov/nuccore/$Lest\">$Hest</a>";
        if (exists(${$est}{$i}))
        {
            for (my $w = 1; $w <= $#{${$est}{$i}}; $w++)
            {
                $Hest = ${$est}{$i}[$w];
                chomp $Hest;
                $Lest = 'link break';
                $Lest = $3 if $Hest =~ /gi\|(.*?)\|(.*?)\|(.*)\|$/;
                print $tab "<br /><a href=\"https://www.ncbi.nlm.nih.gov/nuccore/$Lest\">$Hest</a>\n";
            }
            delete ${$est}{$i};
        }
        print $tab "</td>\n</tr>\n";

        ## extra row spanning the table with the per-locus image ##
        my $img = $i.'.svg';
        my $colspan = scalar(@fastq1) * 2 + 8;
        print $tab "<tr>\n\t<td colspan=\"$colspan\"><img src=\"$img\" /></td>\n</tr>\n";
    }

    print $tab "</tbody>\n</table>\n";
    print $tab "<a href=\"report.txt\">Report</a>";
    print $tab "</body>\n</html>\n";
    close $tab;
}

############################################################
## save_csv: save results as tab-separated text files and  #
## annotate them (genes, LINE repeats) through R           #
## Writes results.txt, first_results.txt and               #
## final_result_chimerae.txt in $html_repertory.           #
## Uses globals $html_repertory, @garbage, @fastq1, @name, #
## @results, and Statistics::R (loaded at file scope).     #
############################################################

sub save_csv
{
    my $out = $html_repertory;
    my $Line_only   = $html_repertory.'/'.'Line_only_hg19.txt.gz'; push(@garbage, $Line_only);   # LINE-only hg19 repeat table
    my $Hg19_refseq = $html_repertory.'/'.'hg19_refseq.bed';       push(@garbage, $Hg19_refseq); # hg19 RefSeq bed file
    my $out1 = $html_repertory.'/results.txt';
    my $out2 = $html_repertory.'/first_results.txt';
    my $out3 = $html_repertory.'/final_result_chimerae.txt';

    ## load the databases needed by the R annotation step ##
    `wget https://galaxy.gred-clermont.fr/clifinder/Line_only_hg19.txt.gz -P $out`;
    `wget https://galaxy.gred-clermont.fr/clifinder/hg19_refseq.bed -P $out `;

    ## raw result table ##
    my $filed = $out1;
    # BUGFIX: the die message referred to $file (the unrelated report-file
    # global) instead of the file actually being opened.
    open(my $tab, '>', $filed) || die "cannot open $filed";
    print $tab "L1 chromosome \t L1 start \t L1 end \t L1 strand";
    for my $i (0 .. $#fastq1)
    {
        print $tab "\t $name[$i] read #";
    }
    print $tab "\t Chimera chromosome\t Chimera start \t Chimera end \t Chimera strand";
    for my $i (0 .. $#fastq1)
    {
        print $tab "\t $name[$i] read #";
    }
    print $tab "\n";
    for my $i (0 .. $#results)
    {
        my $rowref = $results[$i];
        my $n = @$rowref - 1;
        for my $j (0 .. $n - 1)
        {
            print $tab "$results[$i][$j]\t";
        }
        print $tab "$results[$i][$n]\n";
    }
    close $tab;

    ## annotate via R (GenomicRanges/plyr); bridge through Statistics::R ##
    my $R = Statistics::R->new();
    $R->startR;
    eval {
        $R->send(
            qq '
            rm(list=ls())
            library(GenomicRanges)
            library(plyr)
            chim<-read.delim("$out1")

            chim<-chim[order(chim[,$#fastq1+7],decreasing=F),]
            chim<-chim[order(chim[,2],decreasing=F),]
            chr<-sub("chr","",as.character(chim[,1]))
            chim<-chim[order(as.numeric(chr)),]

            grchim <- GRanges(seqnames = chim[,1],
            IRanges(start = chim[,2],
            end = chim[,3]),strand=chim[,4])

            grchim\$ID<-paste("Id_",1:length(chim[,1]),sep="")

            mcols(grchim)<-cbind(mcols(grchim),chim[,5:($#fastq1+9)])

            grfusR<- union(grchim,grchim)

            position<-as.data.frame(findOverlaps(grfusR,grchim))

            grfusR\$dup<- table(position[,1])

            position2<-as.data.frame(findOverlaps(grfusR[grfusR\$dup>1],grchim))

            grfusR2<-grfusR
            strand(grfusR2)<- "+"
            gr3<-union(grfusR2,grfusR2)
            position3<- as.data.frame(findOverlaps(gr3,grfusR2))
            gr3\$dup<-table(position3[,1])

            position3<-as.data.frame(findOverlaps(gr3[gr3\$dup>1],grfusR2))
            grfusR\$info<-"no"
            grfusR\$info [position3[,2]]<-"overlap sens opposé"

            grfusR\$ID<-"Id"
            grfusR\$ID[position[!duplicated(position[,1]),1]]<-grchim\$ID[position[!duplicated(position[,1]),2]]

            if(nrow(position2)!=0)
            {
            result <- aggregate(position2[,2] ~ position2[,1], data = position2, paste, collapse = "_")
            grfusR\$ID[grfusR\$dup>1]<-paste("ID",result[,2],sep="_")
            }

            mcols(grfusR)<-cbind(mcols(grfusR), mcols(grchim[position[!duplicated(position[,1]),2]]))

            min<-ddply(as.data.frame(grchim), .(seqnames,end,strand), function(x)x[x\$Chimera.start==min(x\$Chimera.start),])
            min<-ddply(as.data.frame(min), .(seqnames,start,strand), function(x)x[x\$Chimera.start==min(x\$Chimera.start),])
            max<-ddply(as.data.frame(grchim), .(seqnames,end,strand), function(x)x[x\$Chimera.end==max(x\$Chimera.end),])
            max<-ddply(as.data.frame(max), .(seqnames,start,strand), function(x)x[x\$Chimera.end==max(x\$Chimera.end),])

            grfusR<-as.data.frame(grfusR)
            grfusR<-grfusR[order(grfusR[,1],grfusR[,2],grfusR[,3],grfusR[,4],decreasing=F),]

            grfusR\$Chimera.start<- min\$Chimera.start
            grfusR\$Chimera.end<-max\$Chimera.end

            datax<-as.data.frame(grfusR)
            colnames(datax)[1:3]<-colnames(chim)[1:3]
            colnames(datax)[5]<-colnames(chim)[4]

            grchim2 <- GRanges(seqnames = datax[,$#fastq1+12],
            IRanges(start = datax[,$#fastq1+13],
            end = datax[,$#fastq1+14]),strand=datax[,$#fastq1+15])
            mcols(grchim2)<-datax[,-c(4,$#fastq1+12:$#fastq1+15)]

            grfus<- union(grchim2,grchim2)

            position<-as.data.frame(findOverlaps(grfus,grchim2))

            grfus\$dup<- table(position[,1])

            position2<-as.data.frame(findOverlaps(grfus[grfus\$dup>1],grchim2))

            grchim2[ position2[,2] ]

            grfus\$ID_final<-"Id"
            grfus\$ID_final[position[!duplicated(position[,1]),1]]<-grchim2\$ID[position[!duplicated(position[,1]),2]]

            if(nrow(position2)!=0)
            {
            result <- aggregate(position2[,2] ~ position2[,1], data = position2, paste, collapse = "_")
            grfus\$ID_final[grfus\$dup>1]<-paste("Id",result[,2],sep="_")
            }

            mcols(grfus)<-cbind(mcols(grfus), mcols(grchim2[position[!duplicated(position[,1]),2]]))

            for (i in 0:$#fastq1)
            {
            mcols(grfus)[grfus\$dup>1,12+i] <- mcols(grfus)[grfus\$dup>1,12+i] + mcols(grchim2)[position[duplicated(position[,1]),2],10+i]
            }

            grfus2<-grfus
            strand(grfus2)<-"+"
            gr3<-union(grfus2,grfus2)

            position3<- as.data.frame(findOverlaps(gr3,grfus2))
            gr3\$dup<-table(position3[,1])

            position3<-as.data.frame(findOverlaps(gr3[gr3\$dup>1],grfus2))

            grfus\$info [position3[,2]]<-"overlap sens opposé"

            min<-ddply(as.data.frame(grchim2), .(seqnames,end,strand), function(x)x[x\$L1.start==min(x\$L1.start),])
            min<-ddply(data.frame(min), .(seqnames,start,strand), function(x)x[x\$L1.start==min(x\$L1.start),])
            max<-ddply(as.data.frame(grchim2), .(seqnames,end,strand), function(x)x[x\$L1.end==max(x\$L1.end),])
            max<-ddply(data.frame(max), .(seqnames,start,strand), function(x)x[x\$L1.end==max(x\$L1.end),])

            grfus1<-as.data.frame(grfus)
            grfus1<-grfus1[order(grfus1[,1],grfus1[,2],grfus1[,3],grfus1[,4],decreasing=F),]

            grfus1\$L1.start<- min\$L1.start
            grfus1\$L1.end<-max\$L1.end

            dataf<-as.data.frame(grfus1)

            result<-( data.frame("Chimera.Chr"= dataf\$L1.chromosome, "Chimera.Start"=apply(data.frame( dataf\$start,dataf\$end,dataf\$L1.start,dataf\$L1.end ),1,min) , "Chimera.End"= apply(data.frame( dataf\$start,dataf\$end,dataf\$L1.start,dataf\$L1.end ),1,max) ,"Chimera.Strand"=dataf\$L1.strand ,"L1.Chr"= dataf\$L1.chromosome, "L1.Start"=dataf\$L1.start ,"L1.End"= dataf\$L1.end , "L1.Strand"=dataf\$L1.strand , "Unique.Chr"= dataf\$seqnames, "Unique.Start"=dataf\$start , "Unique.End"= dataf\$end , "Unique.Strand"=dataf\$strand , "ID_final"=dataf\$ID_final,"info"=dataf\$info, dataf[,18:($#fastq1+18)] ) )

            result<-result[order(result[,2],decreasing=F),]
            chr<-sub("chr","",as.character(result[,1]))
            result<-result[order(as.numeric(chr)),]
            options(scipen=10)
            write.table(result,"$out2",sep="\t",row.names = F,quote = F)
            grchim <- GRanges(seqnames = result\$L1.Chr,
            IRanges(start = result\$L1.Start,
            end = result\$L1.End),strand=result\$L1.Strand)
            mcols(grchim)<-result

            Rep<-read.delim("$Line_only",skip=1)

            Gene<-read.delim("$Hg19_refseq")
            grLINE <- GRanges(seqnames = Rep\$genoName,
            IRanges(start = Rep\$genoStart,
            end = Rep\$genoEnd),
            repStrand=as.character(Rep\$strand),
            repName =as.character(Rep\$repName))

            grGene <- GRanges(seqnames = Gene\$chrom,
            IRanges(start = Gene\$txStart,
            end = Gene\$txEnd),
            geneStrand=as.character(Gene\$strand),
            geneName = as.character(Gene\$name2))

            position<-as.data.frame(findOverlaps(grchim,grLINE))

            position2<-as.data.frame(findOverlaps(grchim,grGene))

            table(grLINE\$repName[position[,2]])
            write.table(table(grLINE\$repName[position[,2]]))

            grchim\$GeneName<-"no_gene"

            grchim\$GeneName[position2[,1]]<- grGene\$geneName[position2[,2]]

            grchim\$GeneStrand<-"*"

            grchim\$GeneStrand[position2[,1]]<- grLINE\$repStrand[position2[,2]]

            grchim\$repName<-"no"

            grchim\$repName[position[,1]]<- grLINE\$repName[position[,2]]

            grchim\$repStart<-0

            grchim\$repStart[position[,1]]<-start(grLINE[position[,2]])

            grchim\$repEnd<-0

            grchim\$repEnd[position[,1]]<-end(grLINE[position[,2]])

            grchim\$repWidth<-0

            grchim\$repWidth[position[,1]]<-width(grLINE[position[,2]])

            grchim\$repStrand<-"*"

            grchim\$repStrand[position[,1]]<- grLINE\$repStrand[position[,2]]

            dup<-position[duplicated(position[,1]),1]
            if(length(dup) != 0)
            {
            for (i in 1:length(dup))
            {

            grchim\$repName[dup[i]] <-paste(grLINE\$repName[position[position[,1]==dup[i],2]],collapse="/")
            grchim\$repStart[dup[i]] <-paste(start(grLINE[position[position[,1]==dup[i],2]]),collapse="/")
            grchim\$repEnd[dup[i]] <-paste(end(grLINE[position[position[,1]==dup[i],2]]),collapse="/")
            grchim\$repWidth[dup[i]] <-paste(width(grLINE[position[position[,1]==dup[i],2]]),collapse="/")
            grchim\$repStrand[dup[i]] <-paste(grLINE\$repStrand[position[position[,1]==dup[i],2]],collapse="/")

            }
            }

            final_result<-as.data.frame(grchim)
            options(scipen=10)
            write.table(final_result[,-c(1:5)],"$out3",sep="\t",row.names = F,quote = F)
            '
        );
    };
    # Fixed in the R script above: "if(length(dup != 0))" tested the length of
    # the logical vector dup != 0 (always >= 1 when position is non-empty);
    # the intended test is length(dup) != 0.
    # NOTE(review): GeneStrand is filled from grLINE$repStrand rather than
    # grGene$geneStrand — looks like a copy/paste slip, but kept as-is pending
    # confirmation against the reference implementation.

    $R->stop();
    unlink @garbage;
}