Mercurial > repos > clifinder > clifinder
changeset 3:4cab6b82be6f draft
Uploaded
author | clifinder |
---|---|
date | Mon, 21 Aug 2017 05:13:22 -0400 |
parents | d85b4a7aaa59 |
children | e5d1695fad54 |
files | LINE_html4.1.pl LINE_html_4_1.xml tool-data/._bwa_index.loc tool-data/bwa_index.loc tool_data_table_conf.xml |
diffstat | 5 files changed, 14 insertions(+), 1219 deletions(-) [+] |
line wrap: on
line diff
--- a/LINE_html4.1.pl Mon Aug 21 05:06:56 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1101 +0,0 @@ - #/usr/bin/perl - -################################################ -#Declaration of necessary libraries############# -################################################ - -use strict; -use warnings; -use Sys::CPU; -use Parallel::ForkManager; -use POSIX; -use Statistics::R ; -use Getopt::Long; -use File::Basename; - -################################################ -#Declaration of necessary global variables###### -################################################ - -my (@fastq1, @fastq2, @name, $html, $size_reads, $ref, $TE, $build_index, $build_TE, $html_repertory, $maxInsertSize, $prct, $help, $image, $Bdir, $minL1, $mis_L1); - -##################################################################### -#Definition options of execution according to the previous variables# -##################################################################### - -GetOptions ( -"first:s" => \@fastq1, -"second:s" => \@fastq2, -"name:s" => \@name, -"html:s" => \$html, -"TE:s" => \$TE, -"ref:s" => \$ref, -"build_TE" => \$build_TE, -"build_index" => \$build_index, -"pourcentage:i" => \$prct, -"size_insert:i" => \$maxInsertSize, -"size_read:i" => \$size_reads, -"html_path:s" => \$html_repertory, -"image:s" => \$image, -"BDir:i" => \$Bdir, -"minL1:i" => \$minL1, -"mis_L1:i" => \$mis_L1, -); -my $iprct = 100 - (($prct / $size_reads)*100) ; -my $mis_auth = $size_reads - $minL1 + $mis_L1 ; -my $eprct = ($iprct * $size_reads) /100; -my $dprct = ((100-$iprct) * $size_reads) / 100; - -################################################ -#Construct index of ref and TE if doesn't exist# -################################################ - -`(bwa index $ref) 2> /dev/null ` if ($build_index); -`(bwa index $TE) 2> /dev/null ` if ($build_TE); - -############################################ -#Create repository to store resulting files# -############################################ - -mkdir $html_repertory; - -###################################################### -#Define number of Cpu process we can use = total /5 # -###################################################### - -my $cpu = int(Sys::CPU::cpu_count() /5) ; - -########################################## -#Define hash # -########################################## - -my %frag_exp_id; - -########################## -#Data file we have to use# -########################## - -my $NCBI_est = $html_repertory.'/est_human'; # NCBI Human est -my $NCBI_rna = $html_repertory.'/rna'; # NCBI Human rna -my $rmsk = $html_repertory.'/rmsk.bed'; # UCSC repeat sequences -#################################### -# Redirection standart error output# -#################################### - -my $file = $html_repertory.'/report.txt'; -open STDERR, '>', $file or die "Can't redirect STDERR: $!"; - -############################## -# Analyse of each fastq file # -############################## - - -my @garbage; my $num = 0; -foreach my $tabR (0..$#fastq1) -{ - ################################################### - # Paired end mapping against L1 promoter sequences# - ################################################### - - print STDERR "alignement of $name[$tabR] to L1\n"; - my $sam = $html_repertory.'/'.$name[$tabR]."_L1.sam"; push(@garbage, $sam); - align_paired( $TE, $fastq1[$tabR], $fastq2[$tabR], $sam, $cpu, $mis_auth); - print STDERR "alignement done\n"; - - ################################################## - # Creation of two fastq for paired halfed mapped:# - # - _1 correspond to sequences mapped to L1 # - # - _2 correspond to sequences unmapped to L1 # - ################################################## - print STDERR "getting pairs with one mate matched to L1 and the other mate undetected by repeatmasker as a repeat sequence\n"; - - my $out_ASP_1 = $html_repertory.'/'.$name[$tabR]."_1.fastq"; push(@garbage, $out_ASP_1); - my $out_ASP_2 = $html_repertory.'/'.$name[$tabR]."_2.fastq"; push(@garbage, $out_ASP_2); - - ##split mate that matched to L1 and others## - my ($ASP_readsHashR, $half_num_out) = get_half($sam); - # $ASP_reads{$line[0]}[0] mapped - $ASP_reads{$line[0]}[1] unmapped - - ##pairs obtained after repeatmasker on the other mate## - my $left = sort_out($cpu, $out_ASP_1, $out_ASP_2, $ASP_readsHashR); - - print STDERR "number of half mapped pairs : $half_num_out\n"; - print STDERR "number of pairs after repeatmasker: $left\n"; - - ################################################## - # Alignment of halfed mapped pairs on genome # - ################################################## - print STDERR "alignement of potential chimeric sequences to the genome\n"; - $sam = $html_repertory.'/'.$name[$tabR]."_genome.sam"; - push(@garbage, $sam); - align_genome( $ref, $out_ASP_1, $out_ASP_2, $sam, $cpu); - print STDERR "alignement done\n"; - - ##compute the number of sequences obtained after alignement ## - - $left = `samtools view -Shc $sam 2> /dev/null`; - chomp $left; $left = $left/2; - print STDERR "number of sequences....: $left\n"; - - ################################################## - # Create bedfiles of potential chimerae # - # and Know repeat sequences removed # - ################################################## - - print STDERR "looking for chimerae\n"; - results($html_repertory, $sam, $name[$tabR], \%frag_exp_id, $num); - $num++; - -} - - - -##define files variables ## - -my $repfirst = $html_repertory.'/first.bed'; push(@garbage,$repfirst); -my $repsecond = $html_repertory.'/second.bed'; push(@garbage,$repsecond); -my $repMfirst = $html_repertory.'/firstM.bed'; push(@garbage,$repMfirst); -my $repMsecond = $html_repertory.'/secondM.bed'; push(@garbage,$repMsecond); -#my $covRepMsecond = $html_repertory.'/covSecondM.bed'; push(@garbage,$covRepMsecond); - - -##Concate all files for first and second mate results ## - -`cat $html_repertory/*-first.bed > $repfirst`; #*/ -`cat $html_repertory/*-second.bed > $repsecond`; #*/ - -## Sort Files and generate files that merge reads in the same locus ## -`bedtools sort -i $repfirst 2> /dev/null | bedtools merge -scores max -d 100 -s -nms -i /dev/stdin > $repMfirst 2> /dev/null`; -`bedtools sort -i $repsecond 2> /dev/null | bedtools merge -scores max -d 100 -s -nms -i /dev/stdin > $repMsecond 2> /dev/null`; - -my (%Gviz, %frag_uni, @second_R, @second_exp, @results); -my $merge_target = $html_repertory.'/target_merged.bed'; push(@garbage, $merge_target); -my $merge = $html_repertory.'/merged.bed'; push(@garbage, $merge); - -open (my $mT, ">".$merge_target) || die "cannot open $merge_target\n"; -open (my $m, ">".$merge) || die "cannot open $merge\n"; -open (my $in, $repMsecond) || die "cannot open secondM\n"; -my $cmp = 0; -while (<$in>) -{ - chomp $_; - my @tmp = (0) x scalar(@fastq1); - my @line = split /\t/, $_; - my @names =split /;/, $line[3]; - foreach my $n (@names){$n =~/(.*?)\/[12]/; $frag_uni{$1} = $cmp; $tmp[$frag_exp_id{$1}]++; } - $second_exp[$cmp] = \@tmp; - $cmp++; - push @second_R, [$line[0],$line[1],$line[2],$line[5]]; -} - -$cmp = 0; -open ($in, $repMfirst) || die "cannot open firstM\n"; -while (<$in>) -{ - chomp $_; - my %sec; - my @line = split /\t/, $_; - my @names =split /;/, $line[3]; - my @tmp = (0) x scalar(@fastq1); - foreach my $n (@names){$n =~/(.*?)\/[12]/; $tmp[$frag_exp_id{$1}]++; } - foreach my $n (@names) - { - $n =~/(.*?)\/[12]/; - unless (exists ($sec{$frag_uni{$1}}) ) - { - my @lmp = ($line[0], $line[1], $line[2], $line[5]); - foreach my $exp_N (@tmp){ push @lmp, $exp_N;} - push (@lmp, $second_R[$frag_uni{$1}]->[0], $second_R[$frag_uni{$1}]->[1], $second_R[$frag_uni{$1}]->[2], $second_R[$frag_uni{$1}]->[3]); - foreach my $exp_N (@{$second_exp[$frag_uni{$1}]}){ push @lmp, $exp_N;} - - my $name = $cmp.'-'.$second_R[$frag_uni{$1}]->[0].'-'.$second_R[$frag_uni{$1}]->[1].'-'.$second_R[$frag_uni{$1}]->[2]; - print $mT $second_R[$frag_uni{$1}]->[0]."\t".$second_R[$frag_uni{$1}]->[1]."\t".$second_R[$frag_uni{$1}]->[2]."\t$name\t29\t".$second_R[$frag_uni{$1}]->[3]."\n"; - - my ($b, $e) = (0,0); - if ($line[1] < $second_R[$frag_uni{$1}]->[1]) - { - $b = $line[1] - 1000; $e = $second_R[$frag_uni{$1}]->[2] + 1000; - $name = $cmp.'-'.$line[0].'-'.$b.'-'.$e; - print $m $line[0]."\t".$b."\t".$e."\t$name\t29\t".$second_R[$frag_uni{$1}]->[3]."\n"; - } - else - { - $b = $second_R[$frag_uni{$1}]->[1] - 1000; $e = $line[2] + 1000; - $name = $cmp.'-'.$line[0].'-'.$b.'-'.$e; - print $m $line[0]."\t".$b."\t".$e."\t$name\t29\t".$second_R[$frag_uni{$1}]->[3]."\n"; - } - $results[$cmp] = \@lmp; - $cmp++; - } - $sec{$frag_uni{$1}} = undef; - } -} -close $mT; close $m; - -my $fasta = $html_repertory.'/target_merged.fasta'; push(@garbage, $fasta); -my $extend = $html_repertory.'/extend.fasta'; push(@garbage, $extend); - -`bedtools getfasta -name -fi $ref -bed $merge -fo $extend`; -`bedtools getfasta -name -fi $ref -bed $merge_target -fo $fasta`; - -################################################ -#Blast against human rna and est # -################################################ - -##get databases for est and rna -`wget -r -nH -nd -np --accept=est* https://galaxy.gred-clermont.fr/clifinder/ -P $html_repertory `; -`wget -r -nH -nd -np --accept=rna* https://galaxy.gred-clermont.fr/clifinder/ -P $html_repertory `; - - -print STDERR "blast against human rna\n"; -my $tabular = $html_repertory."/chimerae_rna.tab"; push(@garbage, $tabular); -blast($NCBI_rna, $fasta, $tabular, $cpu); -my $rna = extract_blast($tabular); - -print STDERR "blast against human est\n"; -my $tabular2 = $html_repertory."/chimerae_est.tab";push(@garbage, $tabular2); -blast($NCBI_est, $fasta, $tabular, $cpu); -my $est = extract_blast($tabular); - -################################################ -#Create Results html file # -################################################ -print STDERR "save result in file\n"; -save_csv(); -print STDERR "create HTML\n"; -html_tab($rna,$est,$html_repertory); -$extend = $extend.'*'; -push(@garbage,glob($extend)); -unlink @garbage; -my $toErase = $html_repertory.'\rna*'; -unlink glob "$toErase"; -$toErase = $html_repertory.'\est*'; -unlink glob "$toErase"; - - -print STDERR "Job done!\n"; - -########################################### END MAIN ########################################################## - - -############################################################################################################## -############################################ FUNCTIONS ################################################# -############################################################################################################## - - - -############################################################ -##Function that aligned paired-end reads on a genome######## -############################################################ -## @param: # -## $index: referential genome # -## $fasq1: first paired end file # -## $fasq2: second paired end file # -## $sam: Alignment output file # -## $number_of_cpus: Number of Cpu used # -############################################################ - -sub align_genome -{ - my ($index, $fastq1, $fastq2, $sam, $number_of_cpus) = @_ ; - my @L_garbage =(); - my $sai1 = $sam."_temporary.sai1"; push @L_garbage,$sai1; - my $sai2 = $sam."_temporary.sai2"; push @L_garbage,$sai2; - `bwa aln -o4 -e1000 -t $number_of_cpus $index $fastq1 > $sai1 2> /dev/null`; - `bwa aln -o4 -e1000 -t $number_of_cpus $index $fastq2 > $sai2 2> /dev/null`; - ## -A force the insertion size - `bwa sampe -s -A -a $maxInsertSize $index $sai1 $sai2 $fastq1 $fastq2 2> /dev/null | samtools view -F4 -f 2 -Sh /dev/stdin > $sam`; - unlink @L_garbage; -} - - -############################################################ -##Function get_half get alignement on TE ### -############################################################ -## @param: # -## $sam: Name of alignement file # -## # -## @return: # -## $ASP_readsHashR: table to store sequences # -## $half_num_out: number of alignment saved # -############################################################ - -sub get_half -{ - ## store name of file - my $sam = shift; - open(my $fic, $sam) || die "cannot open sam file! $!\n"; ## Open file - my (%ASP_reads); my $cmp = 0; ## Declare variables for - my $sequence = ''; - - ##read file## - while(<$fic>) - { - chomp $_; - ##We don't consider lines of sam files that are in header## - next if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ ); - ##We split line in several part## - my @line = split (/\t/,$_); - - ##Find if alignemets have min_L1 consecutives bases mapped on R1 ## - ##Corriger enlever la condition mis_L1 = 0 ## -# if ($mis_L1 != 0) -# { - if ($_ =~/NM:i:(\d+)\t.*MD:Z:(.*)/) - { - my $misT = $1; my $MD = $2; my $maxT = 0; - $MD = $1 if ($MD =~ /(.*)\t/); - $MD =~ s/\^//g; - my @tab = split /[ATCG]/,$MD; - my $tot = 0; - my $accept = 0; - if ($misT <= $mis_L1){$accept = 1;} - else - { - if ( $mis_L1 > scalar(@tab) ) { $maxT = scalar(@tab); } - else{ $maxT = $mis_L1; } - for (my $t = 0; $t < $maxT ; $t++) - { - $tot += $tab[$t] if $tab[$t] ne ''; - } - $accept = 1 if $tot >= $minL1; - } - ## if sequence is not accepted we go to the next sequence ## - next if $accept == 0; - } -# } - - ##looking for flag of the alignment and keep only good reads## - ##Find if it aligns on R1 or on R2## - - if ($line[1] == 73 || $line[1] == 89 || $line[1] == 117 || $line[1] == 69 || $line[1] == 133 || $line[1] == 181 || $line[1] == 153|| $line[1] == 137) - { - if ( $Bdir == 0 || ($Bdir == 1 && $line[1] & 64) || ($Bdir = 2 && $line[1] & 128)) - { - $cmp++; - $sequence = $line[9]; - ## if sequence is reversed aligned then reverse sequence ## - if ($line[1] & 16) - { - $sequence =reverse($sequence); - $sequence =~ tr/atgcuATGCU/tacgaTACGA/; - } - ## define table contains ## - $ASP_reads{$line[0]} = [undef,undef] unless exists( $ASP_reads{$line[0]} ); - - ##split if first mate (R1) is mapped on L1 or not (R2) ## - if ($line[1] & 8) - { - $ASP_reads{$line[0]}[0] = "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n"; - } - else - { - $ASP_reads{$line[0]}[1] = "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n"; - } - } - } - } - close $fic; - return ( \%ASP_reads, $cmp); -} - -############################################################ -##Function sort_out: extract paired end reads ### -############################################################ -## @param: # -## $cpus: number of Cpu used # -## $out1: output file accepted 1 # -## $out2: output file accepted 2 # -## $readsHashTabR: reads to consider # -############################################################ - -sub sort_out -{ - my ($cpus, $out1, $out2, $readsHashTabR) = @_; - my ($name,$path) = fileparse($out2,'.fastq'); - my %repeat; - my @garbage = (); my $cmp = 0; - my $repout = $html_repertory.'/'.$name."_repout/"; - my $fa = $html_repertory.'/'.$name.".fa"; push (@garbage,$fa ); - my $second = $html_repertory.'/'.$name."_temporary.fastq"; push (@garbage,$second); - mkdir $repout; - my %notLine; - - ##Write on file containing of readssHashTabR - - open(my $tmp, ">".$second) || die "cannot open temp file $second\n"; - while ( my ($k,$v) = each %{$readsHashTabR} ) - { - print $tmp ${$v}[1] if defined(${$v}[1]); - } - close $tmp; - - ## Transform fastq file to fasta - - `fastq_to_fasta -i $second -o $fa -Q33`; - - ##Launch RepeatMasker on fasta file - - `RepeatMasker -s -pa $cpus -dir $repout -species human $fa`; - my $repfile = $repout.$name.".fa.out"; - open (my $rep, $repfile) || die "cannot open $repfile $!\n"; - while(<$rep>) - { - chomp; - ## test the percent of repeats ## - my $string = $_; - $string =~ s/^\s+//; - next unless ($string =~ /^\d/); - $string =~ s/\s+/ /g; - my @line = split (' ',$string); - if ( exists($repeat{$line[4]}) ) - { - $repeat{$line[4]}->[0] = $line[5] if $repeat{$line[4]}->[0] > $line[5]; - $repeat{$line[4]}->[1] = $line[6] if $repeat{$line[4]}->[1] < $line[6]; - } - else{ $repeat{$line[4]} = [$line[5],$line[6]];} - } - close $rep; - - ## store in table if pair passed the repeat test ## - while (my ($k, $v) = each %repeat) - { - $notLine{$k} = 1 unless ($v->[0] > $dprct || $v->[1] < $eprct); - } - - ##write resulting reads in both files for paired ## - open(my $accepted_1, ">".$out1 ) || die "cannot open $out1 file $!\n"; - open(my $accepted_2, ">".$out2 ) || die "cannot open $out2 file $!\n"; - while ( my ($k,$v) = each %{$readsHashTabR} ) - { - if ( defined (${$v}[0]) && defined (${$v}[1]) ) - { - unless (defined ($notLine{$k}) && $notLine{$k} == 1) - { - $cmp++; - print $accepted_1 ${$v}[0]; - print $accepted_2 ${$v}[1]; - } - } - } - close $accepted_1; close $accepted_2; - - ##drop files and directories generated by repeatmasker## - my $toErase = $repout.'*'; - unlink glob "$toErase"; - unlink @garbage; rmdir $repout; - return $cmp; -} - -############################################################ -##Function that aligned paired-end reads on a referential### -############################################################ -## @param: # -## $index: referential file # -## $fasq1: first file paired end reads # -## $fasq2: second file paired end reads # -## $sam: output alignment file # -## $number_of_cpus: number of CPU used # -## $mis: tolerated mismatches # -############################################################ -sub align_paired -{ - my ($index, $fastq1, $fastq2, $sam, $number_of_cpus, $mis) = @_ ; - my @garbage = (); - my $sai1 = $sam."_temporary.sai1"; push @garbage,$sai1; - my $sai2 = $sam."_temporary.sai2"; push @garbage,$sai2; - - ##alignement with bwa - - `bwa aln -n $mis -t $number_of_cpus $index $fastq1 > $sai1 2> /dev/null`; - `bwa aln -n $mis -t $number_of_cpus $index $fastq2 > $sai2 2> /dev/null`; - `bwa sampe $index $sai1 $sai2 $fastq1 $fastq2 > $sam 2> /dev/null`; - - ## delete temporary single aligned files - unlink @garbage; -} - -############################################################ -##Function that aligned reads on a referential ### -############################################################ -## @param: # -## $index: referential file # -## $fasq: reads file # -## $sam: output alignment file # -## $number_of_cpus: number of CPU used # -############################################################ - -sub align -{ - my ($index, $fastq, $sam, $number_of_cpus ) = @_ ; - `bwa aln -o4 -e$maxInsertSize -t $number_of_cpus $index $fastq 2> /dev/null | bwa samse $index /dev/stdin $fastq > $sam 2> /dev/null`; -} - -############################################################ -##Function results computes bed files for result ### -############################################################ -## @param: # -## $out_repository: repository to store results # -## $file: sam file resulting of alignement # -## $name: name of paireds end reads file # -## $hashRef: store number for each first read value # -## $ps: number of the paired end file # -############################################################ - -sub results -{ - my ($out_repertory, $file, $name, $hashRef,$ps) = @_; - my $namefirst = $out_repertory.'/'.$name.'-first.bed'; push(@garbage, $namefirst); - my $namesecond = $out_repertory.'/'.$name.'-second.bed'; push(@garbage, $namesecond); - - ##get database forrepeatmasker - `wget https://galaxy.gred-clermont.fr/clifinder/rmsk.bed -P $out_repository `; push(@garbage, $rmsk); - - ## store reads mapped in proper pair respectively first and second in pair in bam files and transform in bed files## - `samtools view -Sb -f66 $file 2> /dev/null | bedtools bamtobed -i /dev/stdin > temp_name_first 2> /dev/null`; - `samtools view -Sb -f130 $file 2> /dev/null | bedtools bamtobed -i /dev/stdin > temp_name_second 2> /dev/null`; - - ##compute converage of second mate on rmsk## - my $baseCov = 0; - my %IdCov = (); - my @coverage = `bedtools coverage -b temp_name_second -a $rmsk`; - - - ## store coverage fraction ## - foreach my $covRmsk (@coverage) - { - chomp $covRmsk; - my @split_cov = split /\t/, $covRmsk; - ##generate identifier for IdCov ## - $split_cov[3] =~ /(.*?)\/[12]/; - ##store value in IdCov ## - if (!exists($IdCov{$1})) - { - $IdCov{$1} = $split_cov[-1]; - } - else - { - IdCov{$1} = $split_cov[-1] if $split_cov[-1] > IdCov{$1}; - } - } - - ## get only first mate that have less tant $iprct repeats ## - open (my $tmp_fi, 'temp_name_first') || die "cannot open $namefirst!\n"; - open (my $nam_fi, ">".$namefirst) || die "cannot open $namefirst!\n"; - while (<$tmp_fi>) - { - my @line = split /\t/, $_; - $line[3] =~ /(.*?)\/[12]/; - - if ($IdCov{$1} <= $iprct/100) - { - print $nam_fi $_; - - ${$hashRef}{$1}= $ps; - } - } - close $tmp_fi; close $nam_fi; - - - ## get only second mate that have less than $iprct repeats ## - - open (my $tmp_sec, 'temp_name_second') || die "cannot open $namesecond!\n"; - open (my $nam_sec, ">".$namesecond) || die "cannot open $namesecond!\n"; - while (<$tmp_sec>) - { - my @line = split /\t/, $_; - $line[3] =~ /(.*?)\/[12]/; - if ($IdCov{$1} <= $iprct/100) - { - print $nam_sec $_; - } - } - close $tmp_sec; close $nam_sec; -} - -#sub results -#{ -# my ($out_repertory, $file, $name, $hashRef,$ps) = @_; -# my $namefirst = $out_repertory.'/'.$name.'-first.bed'; push(@garbage, $namefirst); -# my $namesecond = $out_repertory.'/'.$name.'-second.bed'; push(@garbage, $namesecond); -# `samtools view -Sb -f66 $file 2> /dev/null | bedtools bamtobed -i /dev/stdin > $namefirst 2> /dev/null`; -# `samtools view -Sb -f130 $file 2> /dev/null | bedtools bamtobed -i /dev/stdin > $namesecond 2> /dev/null`; -# open( my $in, $out_repertory.'/'.$name.'-first.bed') || die "cannot open first read bed\n"; -# while (<$in>) -# { -# my @line = split /\t/, $_; -# $line[3] =~ /(.*?)\/1/; -# ${$hashRef}{$1}= $ps; -# } -#} - - -############################################################ -##Function blast: blast nucleotide sequences on ref ### -############################################################ -## @param: # -## $db:database where to search # -## $fasta: file containing nucleotide sequences # -## $tabular: out file name # -## $cpu: Number of Cpu used # -############################################################ - - - -sub blast -{ - my ($db, $fasta, $tabular, $cpus) = @_; - `blastn -db $db -query $fasta -num_threads $cpus -out $tabular -outfmt 6 -evalue 10e-10`; -} - -############################################################ -##Function extract_blast: extract result from blast ### -############################################################ -## @param: # -## $file: Name of sequences file # -## @return: hash that contains sequences # -############################################################ - - -sub extract_blast -{ - my $file = shift; - my %hash = (); - open (my $f, $file) || die "cannot open $file\n"; - while (<$f>) - { - chomp $_; - my ($seq,$id) = split /\t/,$_; - $seq = $1 if ($seq =~ /(\d+)-(.*?)-(\d+)-(\d+)/); - $hash{$seq} = [] unless exists $hash{$seq}; - push @{$hash{$seq}}, $id; - } - close $f; - return \%hash; -} - -############################################################ -##Function print_header: header of html file ### -############################################################ -## @param: # -############################################################ - - - -sub print_header -{ - my $fileR = shift; my $title = shift; - print $fileR "<!DOCTYPE html> <html> <head> <title>$title</title>"; - print $fileR "<style type=\"text/css\"> - body { font-family:Arial, Helvetica, Sans-Serif; font-size:0.8em;} - #report { border-collapse:collapse;} - #report h4 { margin:0px; padding:0px;} - #report img { float:right;} - #report ul { margin:10px 0 10px 40px; padding:0px;} - #report th { background:#7CB8E2 url(header_bkg.png) repeat-x scroll center left; color:#fff; padding:7px 15px; text-align:left;} - #report td { background:#C7DDEE none repeat-x scroll center left; color:#000; padding:7px 15px; } - #report tr.odd td { background:#fff url(row_bkg.png) repeat-x scroll center left; cursor:pointer; } - #report div.arrow { background:transparent url(arrows.png) no-repeat scroll 0px -16px; width:16px; height:16px; display:block;} - #report div.up { background-position:0px 0px;} - </style>\n"; - print $fileR " <script src=\"./jquery.min.js\" type=\"text/javascript\"></script>\n"; - print $fileR "<script type=\"text/javascript\"> - \$(document).ready(function(){ - \$(\"#report tr:odd\").addClass(\"odd\"); - \$(\"#report tr:not(.odd)\").hide(); - \$(\"#report tr:first-child\").show(); - - \$(\"#report tr.odd\").click(function(){ - \$(this).next(\"tr\").toggle(); - \$(this).find(\".arrow\").toggleClass(\"up\"); - }); - //\$(\"#report\").jExpand(); -}); -</script>"; -print $fileR "</head> <body> <table id=\"report\" >\n"; -} - -############################################################ -##Function html_tab: definition of html file ### -############################################################ -## @param: # -## $html_repository: repository to store results # -## $rna: results for know RNA # -## $est: results for known EST # -############################################################ - -sub html_tab -{ - my ($rna,$est) = @_; - my $out = $html_repertory; - - `wget https://galaxy.gred-clermont.fr/clifinder/arrows.png -P $out && wget https://galaxy.gred-clermont.fr/clifinder/row_bkg.png -P $out && wget https://galaxy.gred-clermont.fr/clifinder/jquery.min.js -P $out`; - my $chimOut = $html; - - open(my $tab, ">".$chimOut) || die "cannot open $chimOut"; - print_header($tab,"Chimerae"); - print $tab "<tr> - <th>L1 chromosome</th> - <th>L1 start</th> - <th>L1 end</th> - <th>L1 strand</th>"; - for my $i (0..$#fastq1) - { - print $tab "\t<th>$name[$i] read #</th>\n"; - } - print $tab " - <th>Chimera chromosome</th> - <th>Chimera start</th> - <th>Chimera end</th> - <th>Chimera strand</th>"; - for my $i (0..$#fastq1) - { - print $tab " <th>$name[$i] read #</th>\n"; - } - print $tab "\t<th>Known RNA</th> - \t<th>Known EST</th>\n\t<th></th>\n</tr>"; - - for my $i (0..$#results) - { - print $tab "<tr>"; - foreach my $j (@{$results[$i]}) - { - print $tab " <td>$j</td>"; - } - my ($Hrna, $Hest) = ('',''); - $Hrna = ${$rna}{$i}[0] if exists(${$rna}{$i}); - $Hest = ${$est}{$i}[0] if exists(${$est}{$i}); - my $Lrna ='link break'; my $Lest = 'link break'; - chomp $Hrna; chomp $Hest; - $Lrna = $3 if $Hrna =~/gi\|(.*?)\|(.*?)\|(.*)\|$/; - $Lest = $3 if $Hest =~/gi\|(.*?)\|(.*?)\|(.*)\|$/; - print $tab "\t<td><A HREF=\"http://www.ncbi.nlm.nih.gov/nuccore/$Lrna\">$Hrna</A></td>\n"; - print $tab "\t<td><A HREF=\"http://www.ncbi.nlm.nih.gov/nuccore/$Lest\">$Hest</A></td>\n"; - print $tab "\t<td><div class=\"arrow\"></div></td>\n</tr>\n"; - my $img = 'link break'; - $img = $i.'.svg'; - my $colspan = scalar(@fastq1) * 2 + 8 ; - print $tab "<tr>\n\t<td valign=top colspan = $colspan><img src=\"$img\"/></td>\n\t<td valign=top>"; - if (exists(${$rna}{$i})) - { - for (my $w = 1; $w <= $#{${$rna}{$i}}; $w++) - { - $Hrna = ''; - $Hrna = ${$rna}{$i}[$w]; - $Lrna ='link break'; - chomp $Hrna; - $Lrna = $3 if $Hrna =~/gi\|(.*?)\|(.*?)\|(.*)\|$/; - print $tab "<A HREF=\"http://www.ncbi.nlm.nih.gov/nuccore/$Lrna\">$Hrna</A><br>\n"; - } - delete ${$rna}{$i}; - } - print $tab "</td>\n\t<td valign=top>"; - if (exists (${$est}{$i})) - { - for (my $w = 1; $w <= $#{${$est}{$i}}; $w++) - { - $Hest = ''; - $Hest = ${$est}{$i}[$w]; - chomp $Hest; - $Lest ='link break'; - $Lest = $3 if $Hest =~/gi\|(.*?)\|(.*?)\|(.*)\|$/; - print $tab "\t<A HREF=\"http://www.ncbi.nlm.nih.gov/nuccore/$Lest\">$Hest</A><br>\n"; - } - delete ${$est}{$i}; - } - print $tab "</td>\n\t<td></td>\n</tr>\n"; - } - print $tab qw{ - </table> - }; - print $tab "<a href=\"report.txt\">Report</a>"; - print $tab qw{ - </body> - </html> - }; - close $tab; -} - -############################################################ -##Function save_csv: save results in different formats ### -############################################################ -## @param: # -############################################################ -sub save_csv{ - - my $out = $html_repertory; - my $Line_only=$html_repertory.'/'.'Line_only_hg19.txt.gz'; push(@garbage, $Line_only); #Line Only H19 database - my $Hg19_refseq=$html_repertory.'/'.'hg19_refseq.bed'; push(@garbage, $Hg19_refseq);#h19 refseq bed file - my $out1= $html_repertory.'/results.txt'; - my $out2= $html_repertory.'/first_results.txt'; - my $out3 =$html_repertory.'/final_result_chimerae.txt'; - - #load databases needed - - `wget https://galaxy.gred-clermont.fr/clifinder/Line_only_hg19.txt.gz -P $out`; - `wget https://galaxy.gred-clermont.fr/clifinder/hg19_refseq.bed -P $out `; - - - # save result in csv file ## - - my $filed = $out1; - open(my $tab, ">".$filed) || die "cannot open $file"; - print $tab "L1 chromosome \t L1 start \t L1 end \t L1 strand";; - for my $i (0..$#fastq1) - { - print $tab "\t $name[$i] read #"; - } - print $tab "\t Chimera chromosome\t Chimera start \t Chimera end \t Chimera strand"; - for my $i (0..$#fastq1) - { - print $tab "\t $name[$i] read #"; - } - print $tab "\n"; - for my $i ( 0 .. $#results ) - { - my $rowref = $results[$i]; - my $n = @$rowref - 1; - for my $j ( 0 .. $n-1 ) - { - print $tab "$results[$i][$j]\t"; - } - print $tab "$results[$i][$n]\n"; - } - close $tab; - - ##Add some information via R Scripts## - - use Statistics::R; - # Create bridge between Perl and R - my $R = Statistics::R->new(); - $R->startR; - eval{ - $R->send( - qq ' - rm(list=ls()) - library(GenomicRanges) - library(plyr) - chim<-read.delim("$out1") - - chim<-chim[order(chim[,$#fastq1+7],decreasing=F),] - chim<-chim[order(chim[,2],decreasing=F),] - chr<-sub("chr","",as.character(chim[,1])) - chim<-chim[order(as.numeric(chr)),] - - grchim <- GRanges(seqnames = chim[,1], - IRanges(start = chim[,2], - end = chim[,3]),strand=chim[,4]) - - - grchim\$ID<-paste("Id_",1:length(chim[,1]),sep="") - - mcols(grchim)<-cbind(mcols(grchim),chim[,5:($#fastq1+9)]) - - - grfusR<- union(grchim,grchim) - - position<-as.data.frame(findOverlaps(grfusR,grchim)) - - grfusR\$dup<- table(position[,1]) - - position2<-as.data.frame(findOverlaps(grfusR[grfusR\$dup>1],grchim)) - - - grfusR2<-grfusR - strand(grfusR2)<- "+" - gr3<-union(grfusR2,grfusR2) - position3<- as.data.frame(findOverlaps(gr3,grfusR2)) - gr3\$dup<-table(position3[,1]) - - position3<-as.data.frame(findOverlaps(gr3[gr3\$dup>1],grfusR2)) - grfusR\$info<-"no" - grfusR\$info [position3[,2]]<-"overlap sens opposé" - - - - grfusR\$ID<-"Id" - grfusR\$ID[position[!duplicated(position[,1]),1]]<-grchim\$ID[position[!duplicated(position[,1]),2]] - - if(nrow(position2)!=0) - { - result <- aggregate(position2[,2] ~ position2[,1], data = position2, paste, collapse = "_") - grfusR\$ID[grfusR\$dup>1]<-paste("ID",result[,2],sep="_") - } - - mcols(grfusR)<-cbind(mcols(grfusR), mcols(grchim[position[!duplicated(position[,1]),2]])) - - - - min<-ddply(as.data.frame(grchim), .(seqnames,end,strand), function(x)x[x\$Chimera.start==min(x\$Chimera.start),]) - min<-ddply(as.data.frame(min), .(seqnames,start,strand), function(x)x[x\$Chimera.start==min(x\$Chimera.start),]) - max<-ddply(as.data.frame(grchim), .(seqnames,end,strand), function(x)x[x\$Chimera.end==max(x\$Chimera.end),]) - max<-ddply(as.data.frame(max), .(seqnames,start,strand), function(x)x[x\$Chimera.end==max(x\$Chimera.end),]) - - grfusR<-as.data.frame(grfusR) - grfusR<-grfusR[order(grfusR[,1],grfusR[,2],grfusR[,3],grfusR[,4],decreasing=F),] - - grfusR\$Chimera.start<- min\$Chimera.start - grfusR\$Chimera.end<-max\$Chimera.end - - - datax<-as.data.frame(grfusR) - colnames(datax)[1:3]<-colnames(chim)[1:3] - colnames(datax)[5]<-colnames(chim)[4] - - - grchim2 <- GRanges(seqnames = datax[,$#fastq1+12], - IRanges(start = datax[,$#fastq1+13], - end = datax[,$#fastq1+14]),strand=datax[,$#fastq1+15]) - mcols(grchim2)<-datax[,-c(4,$#fastq1+12:$#fastq1+15)] - - - - grfus<- union(grchim2,grchim2) - - position<-as.data.frame(findOverlaps(grfus,grchim2)) - - grfus\$dup<- table(position[,1]) - - position2<-as.data.frame(findOverlaps(grfus[grfus\$dup>1],grchim2)) - - grchim2[ position2[,2] ] - - - - - grfus\$ID_final<-"Id" - grfus\$ID_final[position[!duplicated(position[,1]),1]]<-grchim2\$ID[position[!duplicated(position[,1]),2]] - - if(nrow(position2)!=0) - { - result <- aggregate(position2[,2] ~ position2[,1], data = position2, paste, collapse = "_") - grfus\$ID_final[grfus\$dup>1]<-paste("Id",result[,2],sep="_") - } - - - - mcols(grfus)<-cbind(mcols(grfus), mcols(grchim2[position[!duplicated(position[,1]),2]])) - - for (i in 0:$#fastq1) - { - mcols(grfus)[grfus\$dup>1,12+i] <- mcols(grfus)[grfus\$dup>1,12+i] + mcols(grchim2)[position[duplicated(position[,1]),2],10+i] - } - - - grfus2<-grfus - strand(grfus2)<-"+" - gr3<-union(grfus2,grfus2) - - position3<- as.data.frame(findOverlaps(gr3,grfus2)) - gr3\$dup<-table(position3[,1]) - - position3<-as.data.frame(findOverlaps(gr3[gr3\$dup>1],grfus2)) - - grfus\$info [position3[,2]]<-"overlap sens opposé" - - min<-ddply(as.data.frame(grchim2), .(seqnames,end,strand), function(x)x[x\$L1.start==min(x\$L1.start),]) - min<-ddply(data.frame(min), .(seqnames,start,strand), function(x)x[x\$L1.start==min(x\$L1.start),]) - max<-ddply(as.data.frame(grchim2), .(seqnames,end,strand), function(x)x[x\$L1.end==max(x\$L1.end),]) - max<-ddply(data.frame(max), .(seqnames,start,strand), function(x)x[x\$L1.end==max(x\$L1.end),]) - - grfus1<-as.data.frame(grfus) - grfus1<-grfus1[order(grfus1[,1],grfus1[,2],grfus1[,3],grfus1[,4],decreasing=F),] - - grfus1\$L1.start<- min\$L1.start - grfus1\$L1.end<-max\$L1.end - - dataf<-as.data.frame(grfus1) - - result<-( data.frame("Chimera.Chr"= dataf\$L1.chromosome, "Chimera.Start"=apply(data.frame( dataf\$start,dataf\$end,dataf\$L1.start,dataf\$L1.end ),1,min) , "Chimera.End"= apply(data.frame( dataf\$start,dataf\$end,dataf\$L1.start,dataf\$L1.end ),1,max) ,"Chimera.Strand"=dataf\$L1.strand ,"L1.Chr"= dataf\$L1.chromosome, "L1.Start"=dataf\$L1.start ,"L1.End"= dataf\$L1.end , "L1.Strand"=dataf\$L1.strand , "Unique.Chr"= dataf\$seqnames, "Unique.Start"=dataf\$start , "Unique.End"= dataf\$end , "Unique.Strand"=dataf\$strand , "ID_final"=dataf\$ID_final,"info"=dataf\$info, dataf[,18:($#fastq1+18)] ) ) - - result<-result[order(result[,2],decreasing=F),] - chr<-sub("chr","",as.character(result[,1])) - result<-result[order(as.numeric(chr)),] - options(scipen=10) - write.table(result,"$out2",sep="\t",row.names = F,quote = F) - grchim <- GRanges(seqnames = result\$L1.Chr, - IRanges(start = result\$L1.Start, - end = result\$L1.End),strand=result\$L1.Strand) - mcols(grchim)<-result - - Rep<-read.delim("$Line_only",skip=1) - - Gene<-read.delim("$Hg19_refseq") - grLINE <- GRanges(seqnames = Rep\$genoName, - IRanges(start = Rep\$genoStart, - end = Rep\$genoEnd), - repStrand=as.character(Rep\$strand), - repName =as.character(Rep\$repName)) - - grGene <- GRanges(seqnames = Gene\$chrom, - IRanges(start = Gene\$txStart, - end = Gene\$txEnd), - geneStrand=as.character(Gene\$strand), - geneName = as.character(Gene\$name2)) - - position<-as.data.frame(findOverlaps(grchim,grLINE)) - - position2<-as.data.frame(findOverlaps(grchim,grGene)) - - table(grLINE\$repName[position[,2]]) - write.table(table(grLINE\$repName[position[,2]])) - - grchim\$GeneName<-"no_gene" - - grchim\$GeneName[position2[,1]]<- grGene\$geneName[position2[,2]] - - grchim\$GeneStrand<-"*" - - grchim\$GeneStrand[position2[,1]]<- grLINE\$repStrand[position2[,2]] - - grchim\$repName<-"no" - - grchim\$repName[position[,1]]<- grLINE\$repName[position[,2]] - - grchim\$repStart<-0 - - grchim\$repStart[position[,1]]<-start(grLINE[position[,2]]) - - grchim\$repEnd<-0 - - grchim\$repEnd[position[,1]]<-end(grLINE[position[,2]]) - - grchim\$repWidth<-0 - - grchim\$repWidth[position[,1]]<-width(grLINE[position[,2]]) - - grchim\$repStrand<-"*" - - grchim\$repStrand[position[,1]]<- grLINE\$repStrand[position[,2]] - - dup<-position[duplicated(position[,1]),1] - if(length(dup != 0)) - { - for (i in 1:length(dup)) - { - - grchim\$repName[dup[i]] <-paste(grLINE\$repName[position[position[,1]==dup[i],2]],collapse="/") - grchim\$repStart[dup[i]] <-paste(start(grLINE[position[position[,1]==dup[i],2]]),collapse="/") - grchim\$repEnd[dup[i]] <-paste(end(grLINE[position[position[,1]==dup[i],2]]),collapse="/") - grchim\$repWidth[dup[i]] <-paste(width(grLINE[position[position[,1]==dup[i],2]]),collapse="/") - grchim\$repStrand[dup[i]] <-paste(grLINE\$repStrand[position[position[,1]==dup[i],2]],collapse="/") - - } - } - - final_result<-as.data.frame(grchim) - options(scipen=10) - write.table(final_result[,-c(1:5)],"$out3",sep="\t",row.names = F,quote = F) - ' - ); - }; - - $R->stop(); - unlink @garbage; - - - - -} - -
--- a/LINE_html_4_1.xml Mon Aug 21 05:06:56 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,111 +0,0 @@ -<tool name="chimericLineFinder4" id="LINE mosaic transcripts finder4" version="0.4.1"> - <description>Find chimerics transcripts containing LINEs sequences</description> - <command interpreter="perl"> - - LINE_html4.1.pl - - --first ${first} - --name ${first.name} - --second ${second} - - - #if $Genome.refGenomeSource == "history": - --ref "${Genome.ownFile}" - --build_index - #else: - --ref "${Genome.indices.fields.path}" - #end if - - #if $TE.refGenomeSource == "history": - --TE "${TE.ownFile}" - --build_TE - #else: - --TE "${TE.indices.fields.path}" - #end if - - - #for $input_file in $input_files: - --first ${input_file.first} - --name ${input_file.first.name} - --second ${input_file.second} - #end for - --html $chimerae - --html_path ${chimerae.files_path} - --size_insert $size_insert - --size_read $size - --pourcentage $prct - --image $image - --BDir $BDir - --minL1 $minL1 - --mis_L1 $mis_L1 - - - </command> - - <requirements> - <requirement type="package" version="3.1.0">R</requirement> - <requirement type="package" version="0.7.12">bwa</requirement> - <requirement type="package" version="2.17.0">bedTools</requirement> - </requirements> - - <inputs> - <param format="fastqsanger" name="first" type="data" label="first set of paired-end reads" help=""/> - <param format="fastqsanger" name="second" type="data" label="second set of paired-end reads" help=""/> - <repeat name="input_files" title="Additional Fastq Files"> - <param name="first" type="data" format="fastqsanger" label="FASTQ file" help="first set of paired-end reads" /> - <param name="second" type="data" format="fastqsanger" label="FASTQ file" help="second set of paired-end reads" /> - </repeat> - - <conditional name="Genome"> - <param name="refGenomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?"> - <option value="indexed">Use a built-in index</option> - <option value="history">Use one from the history</option> - </param> - <when value="indexed"> - <param name="indices" type="select" label="Select a reference genome"> - <options from_data_table="bwa_indexes"> - <filter type="sort_by" column="2" /> - <validator type="no_options" message="No indexes are available" /> - </options> - </param> - </when> - <when value="history"> - <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select a reference from history" /> - </when> - </conditional> - - - <conditional name="TE"> - <param name="refGenomeSource" type="select" label="Will you select TE database from your history or use a built-in index?"> - <option value="indexed">Use a built-in index</option> - <option value="history">Use one from the history</option> - </param> - <when value="indexed"> - <param name="indices" type="select" label="Select a TE reference"> - <options from_data_table="bwa_indexes"> - <filter type="sort_by" column="2" /> - <validator type="no_options" message="No indexes are available" /> - </options> - </param> - </when> - <when value="history"> - <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select a reference from history" /> - </when> - </conditional> - - <param name="BDir" type="select" > - <option value="0">Undirectional libraries</option> - <option value="1">TEs sequences in first read in pair</option> - <option value="2">TEs sequences in second read in pair</option> - </param> - - <param name="size" type="integer" value="100" label="reads size"/> - <param name="size_insert" type="integer" value="250" label="maximum insert size (bp)"/> - <param name ="minL1" type="integer" value="50" label="minimun bp mapping on selected TEs database"/> - <param name ="mis_L1" type="integer" value="2" label="number of mismatches tolerated in TEs mapping sequences"/> - <param name="prct" type="integer" value="33" label="minimum consecutive bp corresponding to a unique sequence"/> - </inputs> - <outputs> - <data format="html" name="chimerae" label="${tool.name}_on_${on_string}"/> - </outputs> -</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/bwa_index.loc Mon Aug 21 05:13:22 2017 -0400 @@ -0,0 +1,14 @@ +#-rw-r--r-- 1 james universe 830134 2005-09-13 10:12 phiX.fa.amb +#-rw-r--r-- 1 james universe 527388 2005-09-13 10:12 phiX.fa.ann +#-rw-r--r-- 1 james universe 269808 2005-09-13 10:12 phiX.fa.bwt +#...etc... +# +#Your bwa_index.loc file should include an entry per line for each +#index set you have stored. The "file" in the path does not actually +#exist, but it is the prefix for the actual index files. For example: +# +#phiX174 phiX phiX174 /depot/data2/galaxy/phiX/base/phiX.fa +#hg18canon hg18 hg18 Canonical /depot/data2/galaxy/hg18/base/hg18canon.fa +#hg18full hg18 hg18 Full /depot/data2/galaxy/hg18/base/hg18full.fa +#/orig/path/hg19.fa hg19 hg19 /depot/data2/galaxy/hg19/base/hg19.fa +#...etc... \ No newline at end of file
--- a/tool_data_table_conf.xml Mon Aug 21 05:06:56 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ -<tables> - <!-- Locations of configurations --> - <table name="bwa_indexes" comment_char="#"> - <columns>value, dbkey, name, path</columns> - <file path="tool-data/bwa_index.loc" /> - </table> -</tables> \ No newline at end of file