# HG changeset patch # User mcharles # Date 1407836973 14400 # Node ID 448b10ffb0958bbae7201212aa70503247012f02 Uploaded diff -r 000000000000 -r 448b10ffb095 genephys/Galaxy-Workflow-GenePhys.ga --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genephys/Galaxy-Workflow-GenePhys.ga Tue Aug 12 05:49:33 2014 -0400 @@ -0,0 +1,674 @@ +{ + "a_galaxy_workflow": "true", + "annotation": "", + "format-version": "0.1", + "name": "GenePhys", + "steps": { + "0": { + "annotation": "", + "id": 0, + "input_connections": {}, + "inputs": [ + { + "description": "", + "name": "ASSEMBLY" + } + ], + "name": "Input dataset", + "outputs": [], + "position": { + "left": 201, + "top": 392 + }, + "tool_errors": null, + "tool_id": null, + "tool_state": "{\"name\": \"ASSEMBLY\"}", + "tool_version": null, + "type": "data_input", + "user_outputs": [] + }, + "1": { + "annotation": "", + "id": 1, + "input_connections": {}, + "inputs": [ + { + "description": "", + "name": "GENE XML" + } + ], + "name": "Input dataset", + "outputs": [], + "position": { + "left": 200, + "top": 497 + }, + "tool_errors": null, + "tool_id": null, + "tool_state": "{\"name\": \"GENE XML\"}", + "tool_version": null, + "type": "data_input", + "user_outputs": [] + }, + "2": { + "annotation": "", + "id": 2, + "input_connections": {}, + "inputs": [ + { + "description": "", + "name": "MARKERS" + } + ], + "name": "Input dataset", + "outputs": [], + "position": { + "left": 186, + "top": 618 + }, + "tool_errors": null, + "tool_id": null, + "tool_state": "{\"name\": \"MARKERS\"}", + "tool_version": null, + "type": "data_input", + "user_outputs": [] + }, + "3": { + "annotation": "", + "id": 3, + "input_connections": {}, + "inputs": [ + { + "description": "", + "name": "GENETIC MAP" + } + ], + "name": "Input dataset", + "outputs": [], + "position": { + "left": 200, + "top": 737 + }, + "tool_errors": null, + "tool_id": null, + "tool_state": "{\"name\": \"GENETIC MAP\"}", + "tool_version": null, + "type": "data_input", + "user_outputs": [] + }, + "4": { + "annotation": "", + "id": 4, + "input_connections": {}, + "inputs": [ + { + "description": "", + "name": "REFERENCE(NUC)" + } + ], + "name": "Input dataset", + "outputs": [], + "position": { + "left": 200, + "top": 857 + }, + "tool_errors": null, + "tool_id": null, + "tool_state": "{\"name\": \"REFERENCE(NUC)\"}", + "tool_version": null, + "type": "data_input", + "user_outputs": [] + }, + "5": { + "annotation": "", + "id": 5, + "input_connections": {}, + "inputs": [ + { + "description": "", + "name": "REFERENCE(PROT)" + } + ], + "name": "Input dataset", + "outputs": [], + "position": { + "left": 206, + "top": 982 + }, + "tool_errors": null, + "tool_id": null, + "tool_state": "{\"name\": \"REFERENCE(PROT)\"}", + "tool_version": null, + "type": "data_input", + "user_outputs": [] + }, + "6": { + "annotation": "", + "id": 6, + "input_connections": { + "input_geneticmap": { + "id": 3, + "output_name": "output" + }, + "input_markers": { + "id": 2, + "output_name": "output" + } + }, + "inputs": [], + "name": "extractgenomicsegment", + "outputs": [ + { + "name": "output_file", + "type": "fasta" + } + ], + "position": { + "left": 442, + "top": 200 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "extractgenomicsegment", + "tool_state": "{\"__page__\": 0, \"__rerun_remap_job_id__\": null, \"window\": \"\\\"200000\\\"\", \"input_geneticmap\": \"null\", \"offset\": \"\\\"100000\\\"\", \"chromInfo\": \"\\\"/home/galaxy/galaxy-dist/tool-data/shared/ucsc/chrom/?.len\\\"\", \"input_markers\": \"null\"}", + "tool_version": "0.01", + "type": "tool", + "user_outputs": [] + }, + "7": { + "annotation": "", + "id": 7, + "input_connections": { + "input_fasta": { + "id": 4, + "output_name": "output" + } + }, + "inputs": [], + "name": "fastaGroomerForMakeBlastdb", + "outputs": [ + { + "name": "output_fasta", + "type": "fasta" + } + ], + "position": { + "left": 385, + "top": 811 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "fastaGroomerForMakeBlastdb", + "tool_state": "{\"input_fasta\": \"null\", \"__rerun_remap_job_id__\": null, \"chromInfo\": \"\\\"/home/galaxy/galaxy-dist/tool-data/shared/ucsc/chrom/?.len\\\"\", \"__page__\": 0}", + "tool_version": "0.01", + "type": "tool", + "user_outputs": [] + }, + "8": { + "annotation": "", + "id": 8, + "input_connections": { + "input_fasta": { + "id": 5, + "output_name": "output" + } + }, + "inputs": [], + "name": "fastaGroomerForMakeBlastdb", + "outputs": [ + { + "name": "output_fasta", + "type": "fasta" + } + ], + "position": { + "left": 489, + "top": 955 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "fastaGroomerForMakeBlastdb", + "tool_state": "{\"input_fasta\": \"null\", \"__rerun_remap_job_id__\": null, \"chromInfo\": \"\\\"/home/galaxy/galaxy-dist/tool-data/shared/ucsc/chrom/?.len\\\"\", \"__page__\": 0}", + "tool_version": "0.01", + "type": "tool", + "user_outputs": [] + }, + "9": { + "annotation": "", + "id": 9, + "input_connections": { + "input_genexml": { + "id": 1, + "output_name": "output" + }, + "input_segment": { + "id": 6, + "output_name": "output_file" + } + }, + "inputs": [], + "name": "extractgenesfromsegment", + "outputs": [ + { + "name": "output_gene_nuc", + "type": "fasta" + }, + { + "name": "output_gene_prot", + "type": "fasta" + }, + { + "name": "output_gene_function", + "type": "txt" + } + ], + "position": { + "left": 782, + "top": 207 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "extractgenesfromsegment", + "tool_state": "{\"__page__\": 0, \"input_genexml\": \"null\", \"input_segment\": \"null\", \"chromInfo\": \"\\\"/home/galaxy/galaxy-dist/tool-data/shared/ucsc/chrom/?.len\\\"\", \"__rerun_remap_job_id__\": null}", + "tool_version": "0.01", + "type": "tool", + "user_outputs": [] + }, + "10": { + "annotation": "", + "id": 10, + "input_connections": { + "input_assembly": { + "id": 0, + "output_name": "output" + }, + "input_segment": { + "id": 6, + "output_name": "output_file" + } + }, + "inputs": [], + "name": "extractgenomicsequencefromsegment", + "outputs": [ + { + "name": "output_file", + "type": "fasta" + } + ], + "position": { + "left": 640, + "top": 497 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "extractgenomicsequencefromsegment", + "tool_state": "{\"__page__\": 0, \"__rerun_remap_job_id__\": null, \"input_segment\": \"null\", \"chromInfo\": \"\\\"/home/galaxy/galaxy-dist/tool-data/shared/ucsc/chrom/?.len\\\"\", \"input_assembly\": \"null\"}", + "tool_version": "0.01", + "type": "tool", + "user_outputs": [] + }, + "11": { + "annotation": "", + "id": 11, + "input_connections": { + "input_file": { + "id": 7, + "output_name": "output_fasta" + } + }, + "inputs": [], + "name": "NCBI BLAST+ makeblastdb", + "outputs": [ + { + "name": "outfile", + "type": "data" + } + ], + "position": { + "left": 844, + "top": 840 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/devteam/ncbi_blast_plus/ncbi_makeblastdb/0.1.00", + "tool_state": "{\"__page__\": 0, \"mask_data_file\": \"null\", \"input_file\": \"null\", \"dbtype\": \"\\\"nucl\\\"\", \"__rerun_remap_job_id__\": null, \"hash_index\": \"\\\"True\\\"\", \"tax\": \"{\\\"taxselect\\\": \\\"\\\", \\\"__current_case__\\\": 0}\", \"title\": \"\\\"\\\"\", \"chromInfo\": \"\\\"/home/galaxy/galaxy-dist/tool-data/shared/ucsc/chrom/?.len\\\"\", \"parse_seqids\": \"\\\"False\\\"\"}", + "tool_version": "0.1.00", + "type": "tool", + "user_outputs": [] + }, + "12": { + "annotation": "", + "id": 12, + "input_connections": { + "input_file": { + "id": 8, + "output_name": "output_fasta" + } + }, + "inputs": [], + "name": "NCBI BLAST+ makeblastdb", + "outputs": [ + { + "name": "outfile", + "type": "data" + } + ], + "position": { + "left": 692, + "top": 673 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/devteam/ncbi_blast_plus/ncbi_makeblastdb/0.1.00", + "tool_state": "{\"__page__\": 0, \"mask_data_file\": \"null\", \"input_file\": \"null\", \"dbtype\": \"\\\"prot\\\"\", \"__rerun_remap_job_id__\": null, \"hash_index\": \"\\\"True\\\"\", \"tax\": \"{\\\"taxselect\\\": \\\"\\\", \\\"__current_case__\\\": 0}\", \"title\": \"\\\"\\\"\", \"chromInfo\": \"\\\"/home/galaxy/galaxy-dist/tool-data/shared/ucsc/chrom/?.len\\\"\", \"parse_seqids\": \"\\\"False\\\"\"}", + "tool_version": "0.1.00", + "type": "tool", + "user_outputs": [] + }, + "13": { + "annotation": "", + "id": 13, + "input_connections": { + "db_opts|histdb": { + "id": 11, + "output_name": "outfile" + }, + "query": { + "id": 9, + "output_name": "output_gene_nuc" + } + }, + "inputs": [], + "name": "NCBI BLAST+ blastn", + "outputs": [ + { + "name": "output1", + "type": "tabular" + } + ], + "position": { + "left": 1100, + "top": 334 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/devteam/ncbi_blast_plus/ncbi_blastn_wrapper/0.1.00", + "tool_state": "{\"evalue_cutoff\": \"\\\"0.001\\\"\", \"__page__\": 0, \"adv_opts\": \"{\\\"adv_opts_selector\\\": \\\"basic\\\", \\\"__current_case__\\\": 0}\", \"__rerun_remap_job_id__\": null, \"db_opts\": \"{\\\"db_opts_selector\\\": \\\"histdb\\\", \\\"subject\\\": \\\"\\\", \\\"histdb\\\": null, \\\"__current_case__\\\": 1, \\\"database\\\": \\\"\\\"}\", \"query\": \"null\", \"blast_type\": \"\\\"blastn\\\"\", \"output\": \"{\\\"out_format\\\": \\\"cols\\\", \\\"std_cols\\\": [\\\"qseqid\\\", \\\"sseqid\\\", \\\"pident\\\", \\\"length\\\", \\\"mismatch\\\", \\\"gapopen\\\", \\\"qstart\\\", \\\"qend\\\", \\\"sstart\\\", \\\"send\\\", \\\"evalue\\\", \\\"bitscore\\\"], \\\"ids_cols\\\": null, \\\"tax_cols\\\": null, \\\"__current_case__\\\": 2, \\\"misc_cols\\\": null, \\\"ext_cols\\\": [\\\"nident\\\", \\\"positive\\\", \\\"qlen\\\", \\\"slen\\\", \\\"salltitles\\\"]}\", \"chromInfo\": \"\\\"/home/galaxy/galaxy-dist/tool-data/shared/ucsc/chrom/?.len\\\"\"}", + "tool_version": "0.1.00", + "type": "tool", + "user_outputs": [] + }, + "14": { + "annotation": "", + "id": 14, + "input_connections": { + "db_opts|histdb": { + "id": 11, + "output_name": "outfile" + }, + "query": { + "id": 9, + "output_name": "output_gene_nuc" + } + }, + "inputs": [], + "name": "NCBI BLAST+ tblastx", + "outputs": [ + { + "name": "output1", + "type": "tabular" + } + ], + "position": { + "left": 1107, + "top": 685.5 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/devteam/ncbi_blast_plus/ncbi_tblastx_wrapper/0.1.00", + "tool_state": "{\"evalue_cutoff\": \"\\\"0.001\\\"\", \"__page__\": 0, \"adv_opts\": \"{\\\"adv_opts_selector\\\": \\\"basic\\\", \\\"__current_case__\\\": 0}\", \"__rerun_remap_job_id__\": null, \"db_opts\": \"{\\\"db_opts_selector\\\": \\\"histdb\\\", \\\"subject\\\": \\\"\\\", \\\"histdb\\\": null, \\\"__current_case__\\\": 1, \\\"database\\\": \\\"\\\"}\", \"query_gencode\": \"\\\"1\\\"\", \"output\": \"{\\\"out_format\\\": \\\"cols\\\", \\\"std_cols\\\": [\\\"qseqid\\\", \\\"sseqid\\\", \\\"pident\\\", \\\"length\\\", \\\"mismatch\\\", \\\"gapopen\\\", \\\"qstart\\\", \\\"qend\\\", \\\"sstart\\\", \\\"send\\\", \\\"evalue\\\", \\\"bitscore\\\"], \\\"ids_cols\\\": null, \\\"tax_cols\\\": null, \\\"__current_case__\\\": 2, \\\"misc_cols\\\": null, \\\"ext_cols\\\": [\\\"nident\\\", \\\"positive\\\", \\\"qlen\\\", \\\"slen\\\", \\\"salltitles\\\"]}\", \"query\": \"null\"}", + "tool_version": "0.1.00", + "type": "tool", + "user_outputs": [] + }, + "15": { + "annotation": "", + "id": 15, + "input_connections": { + "db_opts|histdb": { + "id": 12, + "output_name": "outfile" + }, + "query": { + "id": 10, + "output_name": "output_file" + } + }, + "inputs": [], + "name": "NCBI BLAST+ blastx", + "outputs": [ + { + "name": "output1", + "type": "tabular" + } + ], + "position": { + "left": 1104, + "top": 451 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/devteam/ncbi_blast_plus/ncbi_blastx_wrapper/0.1.00", + "tool_state": "{\"evalue_cutoff\": \"\\\"0.001\\\"\", \"__page__\": 0, \"adv_opts\": \"{\\\"adv_opts_selector\\\": \\\"basic\\\", \\\"__current_case__\\\": 0}\", \"__rerun_remap_job_id__\": null, \"db_opts\": \"{\\\"db_opts_selector\\\": \\\"histdb\\\", \\\"subject\\\": \\\"\\\", \\\"histdb\\\": null, \\\"__current_case__\\\": 1, \\\"database\\\": \\\"\\\"}\", \"query_gencode\": \"\\\"1\\\"\", \"query\": \"null\", \"output\": \"{\\\"out_format\\\": \\\"cols\\\", \\\"std_cols\\\": [\\\"qseqid\\\", \\\"sseqid\\\", \\\"pident\\\", \\\"length\\\", \\\"mismatch\\\", \\\"gapopen\\\", \\\"qstart\\\", \\\"qend\\\", \\\"sstart\\\", \\\"send\\\", \\\"evalue\\\", \\\"bitscore\\\"], \\\"ids_cols\\\": null, \\\"tax_cols\\\": null, \\\"__current_case__\\\": 2, \\\"misc_cols\\\": null, \\\"ext_cols\\\": [\\\"nident\\\", \\\"positive\\\", \\\"qlen\\\", \\\"slen\\\", \\\"salltitles\\\"]}\", \"chromInfo\": \"\\\"/home/galaxy/galaxy-dist/tool-data/shared/ucsc/chrom/?.len\\\"\"}", + "tool_version": "0.1.00", + "type": "tool", + "user_outputs": [] + }, + "16": { + "annotation": "", + "id": 16, + "input_connections": { + "db_opts|histdb": { + "id": 12, + "output_name": "outfile" + }, + "query": { + "id": 9, + "output_name": "output_gene_prot" + } + }, + "inputs": [], + "name": "NCBI BLAST+ blastp", + "outputs": [ + { + "name": "output1", + "type": "tabular" + } + ], + "position": { + "left": 1106, + "top": 568.5 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/devteam/ncbi_blast_plus/ncbi_blastp_wrapper/0.1.00", + "tool_state": "{\"evalue_cutoff\": \"\\\"0.001\\\"\", \"__page__\": 0, \"adv_opts\": \"{\\\"adv_opts_selector\\\": \\\"basic\\\", \\\"__current_case__\\\": 0}\", \"__rerun_remap_job_id__\": null, \"blast_type\": \"\\\"blastp\\\"\", \"db_opts\": \"{\\\"db_opts_selector\\\": \\\"histdb\\\", \\\"subject\\\": \\\"\\\", \\\"histdb\\\": null, \\\"__current_case__\\\": 1, \\\"database\\\": \\\"\\\"}\", \"output\": \"{\\\"out_format\\\": \\\"cols\\\", \\\"std_cols\\\": [\\\"qseqid\\\", \\\"sseqid\\\", \\\"pident\\\", \\\"length\\\", \\\"mismatch\\\", \\\"gapopen\\\", \\\"qstart\\\", \\\"qend\\\", \\\"sstart\\\", \\\"send\\\", \\\"evalue\\\", \\\"bitscore\\\"], \\\"ids_cols\\\": null, \\\"tax_cols\\\": null, \\\"__current_case__\\\": 2, \\\"misc_cols\\\": null, \\\"ext_cols\\\": [\\\"nident\\\", \\\"positive\\\", \\\"qlen\\\", \\\"slen\\\", \\\"salltitles\\\"]}\", \"query\": \"null\"}", + "tool_version": "0.1.00", + "type": "tool", + "user_outputs": [] + }, + "17": { + "annotation": "", + "id": 17, + "input_connections": { + "input_blast": { + "id": 13, + "output_name": "output1" + } + }, + "inputs": [], + "name": "parseblasttab", + "outputs": [ + { + "name": "output_merge", + "type": "txt" + }, + { + "name": "output_best", + "type": "txt" + } + ], + "position": { + "left": 1365, + "top": 333.5 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "parseblasttab", + "tool_state": "{\"__page__\": 0, \"__rerun_remap_job_id__\": null, \"input_blast\": \"null\"}", + "tool_version": "0.01", + "type": "tool", + "user_outputs": [] + }, + "18": { + "annotation": "", + "id": 18, + "input_connections": { + "input_blast": { + "id": 14, + "output_name": "output1" + } + }, + "inputs": [], + "name": "parseblasttab", + "outputs": [ + { + "name": "output_merge", + "type": "txt" + }, + { + "name": "output_best", + "type": "txt" + } + ], + "position": { + "left": 1368, + "top": 776.5 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "parseblasttab", + "tool_state": "{\"__page__\": 0, \"__rerun_remap_job_id__\": null, \"input_blast\": \"null\"}", + "tool_version": "0.01", + "type": "tool", + "user_outputs": [] + }, + "19": { + "annotation": "", + "id": 19, + "input_connections": { + "input_blast": { + "id": 15, + "output_name": "output1" + } + }, + "inputs": [], + "name": "parseblasttab", + "outputs": [ + { + "name": "output_merge", + "type": "txt" + }, + { + "name": "output_best", + "type": "txt" + } + ], + "position": { + "left": 1366, + "top": 477.5 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "parseblasttab", + "tool_state": "{\"__page__\": 0, \"__rerun_remap_job_id__\": null, \"input_blast\": \"null\"}", + "tool_version": "0.01", + "type": "tool", + "user_outputs": [] + }, + "20": { + "annotation": "", + "id": 20, + "input_connections": { + "input_blast": { + "id": 16, + "output_name": "output1" + } + }, + "inputs": [], + "name": "parseblasttab", + "outputs": [ + { + "name": "output_merge", + "type": "txt" + }, + { + "name": "output_best", + "type": "txt" + } + ], + "position": { + "left": 1372, + "top": 623.5 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "parseblasttab", + "tool_state": "{\"__page__\": 0, \"__rerun_remap_job_id__\": null, \"input_blast\": \"null\"}", + "tool_version": "0.01", + "type": "tool", + "user_outputs": [] + }, + "21": { + "annotation": "", + "id": 21, + "input_connections": { + "input_blastn": { + "id": 17, + "output_name": "output_best" + }, + "input_blastp": { + "id": 20, + "output_name": "output_best" + }, + "input_blastx": { + "id": 19, + "output_name": "output_best" + }, + "input_tblastx": { + "id": 18, + "output_name": "output_best" + } + }, + "inputs": [], + "name": "mergeAllBestBlast", + "outputs": [ + { + "name": "output_results", + "type": "txt" + } + ], + "position": { + "left": 1739.5, + "top": 384.5 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "mergeAllBestBlast", + "tool_state": "{\"__page__\": 0, \"input_tblastx\": \"null\", \"input_blastp\": \"null\", \"input_blastx\": \"null\", \"__rerun_remap_job_id__\": null, \"input_blastn\": \"null\"}", + "tool_version": "0.01", + "type": "tool", + "user_outputs": [] + } + } +} \ No newline at end of file diff -r 000000000000 -r 448b10ffb095 genephys/extractgenesfromsegment.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genephys/extractgenesfromsegment.pl Tue Aug 12 05:49:33 2014 -0400 @@ -0,0 +1,223 @@ +#!/usr/bin/perl +use strict; + +my $input_gene_file = $ARGV[0]; +my $input_segment_file = $ARGV[1]; +my $output_seq_nuc = $ARGV[2]; +my $output_seq_prot = $ARGV[3]; + +open(IG, $input_gene_file) or die("Can't open $input_gene_file\n"); +open(IS, $input_segment_file) or die("Can't open $input_segment_file\n"); +open (ON,">$output_seq_nuc") or die ("Can't open $output_seq_nuc\n"); +open (OP,">$output_seq_prot") or die ("Can't open $output_seq_prot\n"); + +my $current_annotation=""; +my @gene_annotation; +my @list_gene; +my @current_gene; +my %current_gene_annotation; + +# while ((my $line=)&&($#list_gene<5)){ +while (my $line=){ + if ($line =~/\/){ + if (@current_gene){ + my %current_gene_annotation = %{&extract_annotation(\@current_gene)}; + + + push(@list_gene,\%current_gene_annotation); + undef @current_gene; + } + push (@current_gene,$line); + + } + else { + push (@current_gene,$line); + } +} +close(IG); + +# for (my $i=0;$i<=$#list_gene;$i++){ + # my %current_gene_annotation = %{$list_gene[$i]}; + # foreach my $key (keys %current_gene_annotation){ + # print "TEST ",$key,"\t",$current_gene_annotation{$key},"\n"; + # } +# } + +# my @segment_chr; +# my @segment_start; +# my @segment_stop; + +while (my $line=){ + print "\n$line"; + if ($line =~/(.*?)\:(\d+)\.\.(\d+)/){ + my $chr = $1; + my $start = $2; + my $stop = $3; + + my @list_gene_selected = @{&extract_gene_from_position($chr,$start,$stop,\@list_gene)}; + + if ($#list_gene_selected>=0){ + for (my $i=0;$i<=$#list_gene_selected;$i++){ + my %current = %{$list_gene_selected[$i]},"\n"; + print $current{"00 BN_Id"},"\t",$current{"01 BN_Position"},"\t",$current{"02 ATH_Function"},"\t",$current{"03 ATH_Id"},"\n"; + + my $seq = $current{"04 Sequence"}; + my $formated_seq; + my @SEQ = split(//,$seq); + my $compt_seq=0; + for (my $i=0;$i<=$#SEQ;$i++){ + if ($SEQ[$i] =~ /[ATGNCXatgcnx]/){ + if ($compt_seq == 60){ + $formated_seq .="\n"; + $compt_seq=0; + } + $formated_seq.= $SEQ[$i]; + $compt_seq ++; + } + } + print ON ">",$current{"01 BN_Position"}," (",$current{"00 BN_Id"},")","\n",$formated_seq,"\n"; + + my $prot = $current{"05 Protein"}; + my $formated_prot; + my @PROT = split(//,$prot); + my $compt_prot=0; + for (my $i=0;$i<=$#PROT;$i++){ + if ($PROT[$i] =~ /[A-Za-z\*\+]/){ + if ($compt_prot == 60){ + $formated_prot .="\n"; + $compt_prot=0; + } + $formated_prot.= $PROT[$i]; + $compt_prot ++; + } + } + print OP ">",$current{"01 BN_Position"}," (",$current{"00 BN_Id"},")","\n",$formated_prot,"\n"; + + # foreach my $key (sort keys %current){ + # print " ",$key,"\t",$current{$key},"\n"; + # } + # print "\n"; + } + } + else { + print " NO GENE FOUND\n"; + } + } + else { + print "Error Parsing n°2 : $line\n"; + } +} + +close (IS); + +close (ON); +close (OP); + + +# my @list_gene_selected = @{&extract_gene_from_position("chrA01",1437,3000,\@list_gene)}; + + +# for (my $i=0;$i<=$#list_gene_selected;$i++){ + # my %current = %{$list_gene_selected[$i]},"\n"; + # foreach my $key (keys %current){ + # print $key,"\t",$current{$key},"\n"; + # } +# } + + + + + + + +sub extract_annotation{ + my $ref = shift; + my @gene = @$ref; + my %gene_annotation; + for (my $i=0;$i<=$#gene;$i++){ + #print "TEST : $gene[$i]\n"; + if ($gene[$i]=~/\(.*?)\<\/Id\>/){ + $gene_annotation{"00 BN_Id"} = $1; + } + elsif ($gene[$i]=~/\(.*?)\<\/Position\>/){ + $gene_annotation{"01 BN_Position"} = $1; + } + elsif ($gene[$i]=~/\(.*?)\<\/ATH_Function\>/){ + $gene_annotation{"02 ATH_Function"} = $1; + } + elsif ($gene[$i]=~/\(.*?)\<\/SId\>/){ + $gene_annotation{"03 ATH_Id"} = $1; + } + elsif ($gene[$i]=~/\(.*?)\<\/CDS_Sequence\>/){ #modif 1.11 + $gene_annotation{"04 Sequence"} = $1; + } + elsif ($gene[$i]=~/\(.*?)\<\/Protein\>/){ + $gene_annotation{"05 Protein"} = $1; + # print "TEST : $1\n"; + # exit (0); + } + } + if ((!$gene_annotation{"00 BN_Id"})||(!$gene_annotation{"01 BN_Position"})||(!$gene_annotation{"04 Sequence"})||(!$gene_annotation{"05 Protein"})){ + + print "Erreur Parsing n°3\n"; + print "Id :",$gene_annotation{"00 BN_Id"},"\n"; + print "Position : ",$gene_annotation{"01 BN_Position"},"\n"; + print "ATH Function : ",$gene_annotation{"02 ATH_Function"},"\n"; + print "ATH Id : ",$gene_annotation{"03 ATH_Id"},"\n"; + print "CDS seq : ",$gene_annotation{"04 Sequence"},"\n"; + print "CDS prot : ",$gene_annotation{"05 Protein"},"\n"; + for (my $i=0;$i<=$#gene;$i++){ + print $gene[$i],"\n"; + } + + exit(0); + + } + + return \%gene_annotation; +} + + +sub extract_gene_from_position{ + my $chr = shift; + my $start = shift; + my $end = shift; + + my $ref = shift; + my @list_gene = @$ref; + my @list_gene_selected; + + for (my $i=0;$i<=$#list_gene;$i++){ + my %current_gene_annotation = %{$list_gene[$i]}; + my $current_position = $current_gene_annotation{"01 BN_Position"}; + my $current_chr; + my $current_start; + my $current_end; + + #Extraction de la position + if ($current_position =~ /^(.*?)\:(\d+)[\.]+(\d+)/){ # modif 1.11 + $current_chr = $1; + $current_start = $2; + $current_end = $3; + if ($current_start > $current_end){ + ($current_start,$current_end) = ($current_end,$current_start); + } + } + else { + print "Erreur Parsing n°1\npos : $current_position\n"; + exit(0); + } + #Test de selection + if ($chr eq $current_chr){ + if ( + ($current_end>=$start)&&($current_end<=$end) || + ($current_start>=$start)&&($current_start<=$end) + ) + { + push(@list_gene_selected,$list_gene[$i]); + } + } + + } + return \@list_gene_selected; +} \ No newline at end of file diff -r 000000000000 -r 448b10ffb095 genephys/extractgenesfromsegment.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genephys/extractgenesfromsegment.xml Tue Aug 12 05:49:33 2014 -0400 @@ -0,0 +1,19 @@ + +Extract gene sequence (nucleic, proteic) and function + + extractgenesfromsegment.pl $input_genexml $input_segment $output_gene_nuc $output_gene_prot > $output_gene_function + + + + + + + + + + + + + + + diff -r 000000000000 -r 448b10ffb095 genephys/extractgenomicsegment.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genephys/extractgenomicsegment.pl Tue Aug 12 05:49:33 2014 -0400 @@ -0,0 +1,176 @@ +#!/usr/bin/perl +#V1.10 +use strict; + + +my $inputfile1 = $ARGV[0]; +my $inputfile2 = $ARGV[1]; +my $WINDOW = $ARGV[2]; +my $OFFSET = $ARGV[3]; + +if (!$WINDOW){$WINDOW = 200000;} +if (!$OFFSET){$OFFSET = 100000;} + +open(IF1, $inputfile1) or die("Can't open $inputfile1\n"); +open(IF2, $inputfile2) or die("Can't open $inputfile2\n"); +my $current_annotation=""; +my @list_marquer; +my %chr; +my %position; + +# print "$inputfile2\n"; + +while (my $line=){ + my @cols = split(/\t/,$line); + my %current; + # Number#Map#Name#Chr#Position#GeneAT#FunctionAT + + my $Number = $cols[0]; + my $Map = $cols[2]; + my $Name = $cols[7]; + my $Locus = $cols[8]; + my $Chr = $cols[19]; + my $Position = $cols[20]; + $Position =~ s/\s+//g; + my $GeneAT=$cols[32]; + my $FunctionAT=$cols[37]; + $chr{$Name} = $Chr; + $position{$Name} = $Position; + + ### Modification 1.10 + if ($Locus ne $Name){ + $chr{$Locus} = $Chr; + $position{$Locus} = $Position; + } + ### + + #print "$Number#$Map#$Name#$Chr#$Position#$GeneAT#$FunctionAT\n"; +} +close (IF1); + +# my @key = keys(%chr); +# for (my $i=0;$i<=$#key;$i++){ + # print $key[$i],"\n"; +# } + +while (my $line=){ + my @cols = split (/\s+/,$line); + for (my $i=0;$i<=$#cols;$i++){ + my $current = $cols[$i]; + chomp($current); + if ($current !~ /^\s+$/){ + push(@list_marquer,$current); + } + } +} +close (IF2); + +my %coord_by_chr; +for (my $i=0;$i<=$#list_marquer;$i++){ + my $current_name = $list_marquer[$i]; + my $current_chr = $chr{$current_name}; + my $current_position = $position{$current_name}; + + if ($current_position =~ /^\d+$/){ + my @tbl_coord_for_current_chr; + if ($coord_by_chr{$current_chr}){ + @tbl_coord_for_current_chr = @{$coord_by_chr{$current_chr}}; + } + push(@tbl_coord_for_current_chr,$current_position); + $coord_by_chr{$current_chr}=\@tbl_coord_for_current_chr; + } + elsif (($current_position eq "-")||($current_position =~/none/i)){ + + } + else { + chomp($current_position); + #$current_position =~ s/\s+//g; + print STDERR "Error Parsing $current_name\tposition not recognized : $current_position \n"; + print $list_marquer[$i],"\n"; + #exit(0); + } +} + +# foreach my $key (keys %coord_by_chr){ + # my @tbl_coord = @{$coord_by_chr{$key}}; + # print "\n$key\n"; + # @tbl_coord = sort { $a <=> $b } @tbl_coord; + # for (my $i=0;$i<=$#tbl_coord;$i++){ + # print $tbl_coord[$i],"\n"; + # } +# } + +foreach my $key (sort keys %coord_by_chr){ + my @tbl_coord = @{$coord_by_chr{$key}}; + # print "TEST : $key\n"; + @tbl_coord = sort { $a <=> $b } @tbl_coord; + my $current_start; + my $current_stop; + my $current_start_offset; + my $current_stop_offset; + + + for (my $i=0;$i<=$#tbl_coord;$i++){ + if (!$current_start){$current_start=$tbl_coord[$i];$current_stop=$tbl_coord[$i]} + + # print "$i : $current_start / $current_stop\n"; + if ($tbl_coord[$i]>$current_stop+$WINDOW){ + #OFFSET + if ($current_start>$OFFSET){$current_start_offset=$current_start-$OFFSET;}else{$current_start_offset=1;} + $current_stop_offset = $current_stop + $OFFSET; + ####### + print $key,":",$current_start_offset,"..",$current_stop_offset,"\n"; + + $current_start = $tbl_coord[$i]; + $current_stop = $tbl_coord[$i]; + + if ($i==$#tbl_coord){ + #OFFSET + if ($current_start>$OFFSET){$current_start_offset=$current_start-$OFFSET;}else{$current_start_offset=1;} + $current_stop_offset = $current_stop + $OFFSET; + ####### + print $key,":",$current_start_offset,"..",$current_stop_offset,"\n"; + } + } + else { + $current_stop=$tbl_coord[$i]; + if ($i==$#tbl_coord){ + #OFFSET + if ($current_start>$OFFSET){$current_start_offset=$current_start-$OFFSET;}else{$current_start_offset=1;} + $current_stop_offset = $current_stop + $OFFSET; + ####### + print $key,":",$current_start_offset,"..",$current_stop_offset,"\n"; + } + } + } +} +#Traitement du dernier + +# if ($#tbl_coord == 0){ + # print $key,":",$tbl_coord[$i],"\n"; +# } +# else { + # if ($i==0){ + # push (@current_table,$tbl_coord[$i]); + # } + # else { + # if ($tbl_coord[$i]>$current_table[$#current_table]+$WINDOW){ + # print $key,":",$current_table[0],":",$current_table[$#current_table],"\n"; + # undef @current_table; + # push (@current_table,$tbl_coord[$i]); + # } + # else { + # push (@current_table,$tbl_coord[$i]); + # } + # } +# } + + +# print "\n"; +# foreach my $key (keys %coord_by_chr){ + # print "\n$key\n"; + # @tbl_coord = sort { $a <=> $b } @tbl_coord; + # for (my $i=0;$i<=$#tbl_coord;$i++){ + # print $tbl_coord[$i],"\n"; + # } +# } diff -r 000000000000 -r 448b10ffb095 genephys/extractgenomicsegment.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genephys/extractgenomicsegment.xml Tue Aug 12 05:49:33 2014 -0400 @@ -0,0 +1,21 @@ + +Extract the coordinate of genomic segment containing the genetic markers + + extractgenomicsegment.pl $input_geneticmap $input_markers $window $offset > $output_file + + + + + + + + + + + + + + + + + diff -r 000000000000 -r 448b10ffb095 genephys/extractgenomicsequencefromsegment.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genephys/extractgenomicsequencefromsegment.pl Tue Aug 12 05:49:33 2014 -0400 @@ -0,0 +1,90 @@ +#!/usr/bin/perl +#V1.10 +my $inputsegment = $ARGV[0]; +my $inputfasta = $ARGV[1]; + +open(IS, $inputsegment) or die ("Can't open $inputsegment\n"); +open(IF, $inputfasta) or die ("Can't open $inputfasta\n"); + + +my @header; +my @start; +my @end; +my @segment_header; + +while (my $ligne = ){ + if ($ligne=~/(.*?):(\d+)\.+(\d+)/){ + push (@header,$1); + push (@start,$2); + push (@end,$3); + push (@segment_header,$1.":".$2."..".$3); + } +} + +close (IS); + +#print "TEST : $#header\n"; + +my %genome; + +my $current_header; +my $current_seq=""; +while (my $ligne = ){ + if ($ligne =~ /^\>(.*?)\s*$/){ + if ($current_header){ + $genome{$current_header} = $current_seq; + } + + # my $length = length($current_seq); + # print "TEST : $current_header\t$length\n"; + # print "TEST : $current_header\n"; + $current_header=$1; + $current_seq = ""; + $current_position=0; + } + else { + if ($ligne=~/^([ATGCNXatgcnx]+)\s*$/){ + $current_seq .= $1; + } + else { + print STDERR "Erreur Parsing n°1\n$ligne\n"; + } + } +} + +#TRAITEMENT DU DERNIER +if ($current_header){ + $genome{$current_header} = $current_seq; + undef($current_seq); +} + +# foreach my $key (keys %genome){ + # print $key,"\t",length($genome{$key}),"\n"; +# } + +for (my $i=0;$i<=$#header;$i++){ + my $compt=0; + my $current_seq=""; + print ">",$header[$i],":",$start[$i],"..",$end[$i],"\n"; + ### Modification 1.10 + if ($end[$i]>length($genome{$header[$i]})){ + $end[$i] = length($genome{$header[$i]}); + } + ### + + my @SEQ = split(//,$genome{$header[$i]}); + for (my $coord = $start[$i]-1; $coord<=$end[$i]-1;$coord++){ + $compt++; + # print "TEST : $compt\n"; + if ($compt > 60 ){ + $current_seq .= "\n"; + $compt=1; + } + $current_seq .= $SEQ[$coord]; + + } + print "$current_seq\n"; +} + +close (IF); + diff -r 000000000000 -r 448b10ffb095 genephys/extractgenomicsequencefromsegment.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genephys/extractgenomicsequencefromsegment.xml Tue Aug 12 05:49:33 2014 -0400 @@ -0,0 +1,19 @@ + +Extract the genomic sequence corresponding to a genomic segment (format : chr:start..stop) + + extractgenomicsequencefromsegment.pl $input_segment $input_assembly > $output_file + + + + + + + + + + + + + + + diff -r 000000000000 -r 448b10ffb095 genephys/fastaGroomerForMakeBlastdb.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genephys/fastaGroomerForMakeBlastdb.pl Tue Aug 12 05:49:33 2014 -0400 @@ -0,0 +1,22 @@ +#!/usr/bin/perl +my $inputfasta = $ARGV[0]; + +open(IB, $inputfasta) or die ("Can't open $inputfasta \n"); + +while (my $ligne = ){ + if ($ligne=~/\[.*?\=.*?\]/){ + $ligne =~ s/[\[\]]//g; + print $ligne; + } + elsif ($ligne =~/^\>/){ + print $ligne; + $ligne = ; + $ligne =~ s/N/a/g; + print $ligne; + } + else { + print $ligne; + } +} + +close (IB); diff -r 000000000000 -r 448b10ffb095 genephys/fastaGroomerForMakeBlastdb.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genephys/fastaGroomerForMakeBlastdb.xml Tue Aug 12 05:49:33 2014 -0400 @@ -0,0 +1,20 @@ + +fasta Groomer For MakeBlastdb + + fastaGroomerForMakeBlastdb.pl $input_fasta > $output_fasta + + + + + + + + + +MakeBlastDb has several warning handled has error which is problematic on galaxyn This groomer ensure that the fasta file will not generate this warnings / error : + +- First line of the sequence is more than 40% of N => replace all the 'N' of the first line by 'a' (not elegant but low impact and efficient) + +- Header line should not have [xxx=xxx] kind of text because MakeBlastdb try to interpret it => remove the [] when this feature is detected + + diff -r 000000000000 -r 448b10ffb095 genephys/mergeAllBestBlast.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genephys/mergeAllBestBlast.pl Tue Aug 12 05:49:33 2014 -0400 @@ -0,0 +1,47 @@ +#!/usr/bin/perl +my $inputblastn = $ARGV[0]; +my $inputtblastx = $ARGV[1]; +my $inputblastx = $ARGV[2]; +my $inputblastp = $ARGV[3]; + +open(IN, $inputblastn) or die ("Can't open $inputblastn \n"); +open(ITX, $inputtblastx) or die ("Can't open $inputtblastx \n"); +open(IX, $inputblastx) or die ("Can't open $inputblastx \n"); +open(IP, $inputblastp) or die ("Can't open $inputblastp \n"); + +my %blastx; +my %tblastx; +my %blastp; + +while (my $ligne = ){ + my @fields = split (/\t/,$ligne); + chomp($ligne); + $tblastx{$fields[0]} = $ligne; +} +close (ITX); + +while (my $ligne = ){ + my @fields = split (/\t/,$ligne); + chomp($ligne); + $blastx{$fields[0]} = $ligne; +} +close (IX); + +while (my $ligne = ){ + my @fields = split (/\t/,$ligne); + chomp($ligne); + $blastp{$fields[0]} = $ligne; +} +close (IP); + + +while (my $ligne = ){ + my @fields = split (/\t/,$ligne); + my $query = $fields[0]; + print "BLASTN\t$ligne"; + print "TBLASTX\t",$tblastx{$query},"\n"; + print "BLASTX\t",$blastx{$query},"\n"; + print "BLASTP\t",$blastp{$query},"\n\n"; + +} +close (IN); diff -r 000000000000 -r 448b10ffb095 genephys/mergeAllBestBlast.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genephys/mergeAllBestBlast.xml Tue Aug 12 05:49:33 2014 -0400 @@ -0,0 +1,19 @@ + +Merge best results from Blast + + mergeAllBestBlast.pl $input_blastn $input_tblastx $input_blastx $input_blastp > $output_results + + + + + + + + + + + + + + + diff -r 000000000000 -r 448b10ffb095 genephys/parseblasttab.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genephys/parseblasttab.pl Tue Aug 12 05:49:33 2014 -0400 @@ -0,0 +1,511 @@ +#!/usr/bin/perl +my $inputblast = $ARGV[0]; +my $outputjoin = $ARGV[1]; +my $outputbest = $ARGV[2]; +open(IB, $inputblast) or die ("Can't open $inputblast \n"); +open (OJ, ">$outputjoin") or die ("Can't open $outputjoin \n"); +open (OB, ">$outputbest") or die ("Can't open $outputbest \n"); + +my %all_match; +my @all_match_joined; + + +my $MAX_OVERLAP_FRACTION = 0.5; +my $MAX_OVERLAP_LENGTH_IGNORED = 3; + + +while (my $ligne = ){ + my @fields = split (/\t/,$ligne); + my %match; + $match{"Query"}=$fields[0]; + $match{"Subject_id"}=$fields[1]; + $match{"Subject_start"}=$fields[8]; + $match{"Subject_end"}=$fields[9]; + $match{"Similarity"}=$fields[13]; + $match{"Query_length"}=$fields[14]; + $match{"Subject_length"}=$fields[15]; + $match{"Subject"}=$fields[16]; + + if ($fields[6]<=$fields[7]){ + $match{"Query_start"}=$fields[6]; + $match{"Query_end"}=$fields[7]; + $match{"Orientation"}="+"; + #print "+ $ligne"; + } + else { + $match{"Query_start"}=$fields[7]; + $match{"Query_end"}=$fields[6]; + $match{"Orientation"}="-"; + #print "- $ligne"; + } + + if ($fields[9]<=$fields[8]){ + $match{"Subject_start"}=$fields[9]; + $match{"Subject_end"}=$fields[8]; + $match{"Orientation"}="+"; + #print "+ $ligne"; + } + else { + $match{"Subject_start"}=$fields[8]; + $match{"Subject_end"}=$fields[9]; + $match{"Orientation"}="-"; + #print "- $ligne"; + } + + $match{"Ligne"}=$ligne; + my $key = $match{"Query"}."##".$match{"Subject"}."##".$match{"Orientation"}; + if ($match{"Subject_length"}==0){ + print $ligne,"\n",$match{"Subject_length"},"\n"; + } + my @match_table; + + if ($all_match{$key}){ + @match_table = @{$all_match{$key}}; + } + push (@match_table,\%match); + $all_match{$key} = \@match_table; +} + +foreach my $key (keys %all_match){ + my @match_table = @{$all_match{$key}}; + #### Sort + @match_table = sort mysort @match_table; + + + my @duplicate; + my @overlap; + for (my $i=0;$i<=$#match_table;$i++){ + push (@duplicate,0); + } + print "\nTable Match ($#match_table)\n"; + for (my $i=0;$i<=$#match_table;$i++){ + my %match=%{$match_table[$i]}; + print $match{"Query"},"\t",$match{"Subject_id"},"\t",$match{"Orientation"},"\t",$match{"Query_start"},"\t",$match{"Query_end"},"\t"; + print $match{"Subject_start"},"\t",$match{"Subject_end"},"\t",$match{"Subject_length"},"\t",$match{"Similarity"},"\n"; + } + + #Scan d'inclusion strict + for (my $i=0;$i<=$#match_table;$i++){ + my %match1=%{$match_table[$i]}; + for (my $j=0;$j<=$#match_table;$j++){ + if (($j != $i)&&($duplicate[$j]==0)){ # On scan dans les deux sens, pas seuelment $j = $i+1 a cause du last; + my %match2=%{$match_table[$j]}; + # Inclus Subject + if (($match1{"Subject_start"}>=$match2{"Subject_start"})&&($match1{"Subject_end"}<=$match2{"Subject_end"})) + { + $duplicate[$i]=1; + # print $i," : 1 : ",$match1{"Query"},"\t",$match1{"Subject_id"},"\t",$match1{"Query_start"},"\t",$match1{"Query_end"},"\t",$match1{"Subject_start"},"\t",$match1{"Subject_end"},"\n"; + # print $j," : 1 : ",$match2{"Query"},"\t",$match2{"Subject_id"},"\t",$match2{"Query_start"},"\t",$match2{"Query_end"},"\t",$match2{"Subject_start"},"\t",$match2{"Subject_end"},"\n"; + + last; + } + # Inclus Query + elsif (($match1{"Query_start"}>=$match2{"Query_start"})&&($match1{"Query_end"}<=$match2{"Query_end"})) + { + $duplicate[$i]=2; + # print $i," : 2 : ",$match1{"Query"},"\t",$match1{"Subject_id"},"\t",$match1{"Query_start"},"\t",$match1{"Query_end"},"\t",$match1{"Subject_start"},"\t",$match1{"Subject_end"},"\n"; + # print $j," : 2 : ",$match2{"Query"},"\t",$match2{"Subject_id"},"\t",$match2{"Query_start"},"\t",$match2{"Query_end"},"\t",$match2{"Subject_start"},"\t",$match2{"Subject_end"},"\n"; + last; + } + + } + } + } + + my @match_table_filtered; + for (my $i=0;$i<=$#match_table;$i++){ + if ($duplicate[$i] == 0){ + push (@match_table_filtered,$match_table[$i]); + } + } + + if ($#match_table > $#match_table_filtered){ + @match_table = @match_table_filtered; + print "Table Match filtered 1 ($#match_table)\n"; + for (my $i=0;$i<=$#match_table;$i++){ + my %match=%{$match_table[$i]}; + print $match{"Query"},"\t",$match{"Subject_id"},"\t",$match{"Orientation"},"\t",$match{"Query_start"},"\t",$match{"Query_end"},"\t"; + print $match{"Subject_start"},"\t",$match{"Subject_end"},"\t",$match{"Subject_length"},"\t",$match{"Similarity"},"\n"; + #print $match{"Query"},"\t",$match{"Subject_id"},"\t",$match{"Orientation"},"\t",$match{"Query_start"},"\t",$match{"Query_end"},"\t",$match{"Subject_start"},"\t",$match{"Subject_end"},"\n"; + } + } + + undef @duplicate; + for (my $i=0;$i<=$#match_table;$i++){ + push (@duplicate,0); + } + ######## + + ###Scan d'overlap trop important + # D'abord subject + for (my $i=0;$i<=$#match_table;$i++){ + my %match1=%{$match_table[$i]}; + for (my $j=$i+1;$j<=$#match_table;$j++){ + my %match2=%{$match_table[$j]}; + if (($match1{"Subject_start"}<=$match2{"Subject_start"})&&($match1{"Subject_end"}>=$match2{"Subject_start"})&&($match1{"Subject_end"}<=$match2{"Subject_end"})) + { + my $overlap_length = $match1{"Subject_end"} - $match2{"Subject_start"}+1; + my $length1 = $match1{"Subject_end"} - $match1{"Subject_start"} +1; + my $length2 = $match2{"Subject_end"} - $match2{"Subject_start"} +1; + + if (($length1 >= $length2)&&($overlap_length > $length2 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){ + $duplicate[$j]=1; + # print "DUPLICATE : $j\n"; + } + elsif (($length2 >= $length1)&&($overlap_length > $length1 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){ + $duplicate[$i]=1; + # print "DUPLICATE : $i\n"; + } + else { + } + + # print "$i : 3 : $overlap_length : $length1\n"; + # print "$j : 3 : $overlap_length : $length2\n"; + } + elsif (($match2{"Subject_start"}<=$match1{"Subject_start"})&&($match2{"Subject_end"}>=$match1{"Subject_start"})&&($match2{"Subject_end"}<=$match1{"Subject_end"})) + { + my $overlap_length = $match2{"Subject_end"} - $match1{"Subject_start"}+1; + my $length1 = $match1{"Subject_end"} - $match1{"Subject_start"} +1; + my $length2 = $match2{"Subject_end"} - $match2{"Subject_start"} +1; + + if (($length1 >= $length2)&&($overlap_length > $length2 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){ + $duplicate[$j]=1; + # print "DUPLICATE : $j\n"; + # print "($length1 >= $length2)&&($overlap_length > $length2 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)\n"; + } + elsif (($length2 >= $length1)&&($overlap_length > $length1 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){ + $duplicate[$i]=1; + # print "DUPLICATE : $i\n"; + # print "($length2 >= $length1)&&($overlap_length > $length1 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)\n"; + } + else { + } + + # print "((",$match2{"Subject_start"},"<=",$match1{"Subject_start"},")&&(",$match2{"Subject_end"},">=",$match1{"Subject_start"},")&&(",$match2{"Subject_end"},"<=",$match1{"Subject_end"},"))\n"; + # print "$i : 4 : $overlap_length : $length1\n"; + # print "$j : 4 : $overlap_length : $length2\n"; + } + } + } + + undef @match_table_filtered; + for (my $i=0;$i<=$#match_table;$i++){ + if ($duplicate[$i] == 0){ + push (@match_table_filtered,$match_table[$i]); + } + } + + if ($#match_table > $#match_table_filtered){ + @match_table = @match_table_filtered; + print "Table Match filtered 2 ($#match_table)\n"; + for (my $i=0;$i<=$#match_table;$i++){ + my %match=%{$match_table[$i]}; + print $match{"Query"},"\t",$match{"Subject_id"},"\t",$match{"Orientation"},"\t",$match{"Query_start"},"\t",$match{"Query_end"},"\t"; + print $match{"Subject_start"},"\t",$match{"Subject_end"},"\t",$match{"Subject_length"},"\t",$match{"Similarity"},"\n"; + #print $match{"Query"},"\t",$match{"Subject_id"},"\t",$match{"Orientation"},"\t",$match{"Query_start"},"\t",$match{"Query_end"},"\t",$match{"Subject_start"},"\t",$match{"Subject_end"},"\n"; + } + } + + undef @duplicate; + for (my $i=0;$i<=$#match_table;$i++){ + push (@duplicate,0); + } + + # Ensuite Query (Subject puis Query pour evitez des deletions complementaires query update qui enleverait toutes les entrées) + + for (my $i=0;$i<=$#match_table;$i++){ + my %match1=%{$match_table[$i]}; + for (my $j=$i+1;$j<=$#match_table;$j++){ + my %match2=%{$match_table[$j]}; + if (($match1{"Query_start"}<=$match2{"Query_start"})&&($match1{"Query_end"}>=$match2{"Query_start"})&&($match1{"Query_end"}<=$match2{"Query_end"})) + { + my $overlap_length = $match1{"Query_end"} - $match2{"Query_start"}+1; + my $length1 = $match1{"Query_end"} - $match1{"Query_start"} +1; + my $length2 = $match2{"Query_end"} - $match2{"Query_start"} +1; + + if (($length1 >= $length2)&&($overlap_length > $length2 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){ + $duplicate[$j]=1; + # print "DUPLICATE : $j\n"; + } + elsif (($length2 >= $length1)&&($overlap_length > $length1 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){ + $duplicate[$i]=1; + # print "DUPLICATE : $i\n"; + } + else { + } + + # print "$i : 5 : $overlap_length : $length1\n"; + # print "$j : 5 : $overlap_length : $length2\n"; + } + elsif (($match2{"Query_start"}<=$match1{"Query_start"})&&($match2{"Query_end"}>=$match1{"Query_start"})&&($match2{"Query_end"}<=$match1{"Query_end"})) + { + my $overlap_length = $match2{"Query_end"} - $match1{"Query_start"}+1; + my $length1 = $match1{"Query_end"} - $match1{"Query_start"} +1; + my $length2 = $match2{"Query_end"} - $match2{"Query_start"} +1; + + if (($length1 >= $length2)&&($overlap_length > $length2 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){ + $duplicate[$j]=1; + # print "DUPLICATE : $j\n"; + } + elsif (($length2 >= $length1)&&($overlap_length > $length1 * $MAX_OVERLAP_FRACTION)&&($overlap_length > $MAX_OVERLAP_LENGTH_IGNORED)){ + $duplicate[$i]=1; + # print "DUPLICATE : $i\n"; + } + else { + } + + # print "$i : 6 : $overlap_length : $length1\n"; + # print "$j : 6 : $overlap_length : $length2\n"; + } + } + } + + undef @match_table_filtered; + for (my $i=0;$i<=$#match_table;$i++){ + if ($duplicate[$i] == 0){ + push (@match_table_filtered,$match_table[$i]); + } + } + + if ($#match_table > $#match_table_filtered){ + @match_table = @match_table_filtered; + print "Table Match filtered 3 ($#match_table)\n"; + for (my $i=0;$i<=$#match_table;$i++){ + my %match=%{$match_table[$i]}; + print $match{"Query"},"\t",$match{"Subject_id"},"\t",$match{"Orientation"},"\t",$match{"Query_start"},"\t",$match{"Query_end"},"\t"; + print $match{"Subject_start"},"\t",$match{"Subject_end"},"\t",$match{"Subject_length"},"\t",$match{"Similarity"},"\n"; + # print $match{"Query"},"\t",$match{"Subject_id"},"\t",$match{"Orientation"},"\t",$match{"Query_start"},"\t",$match{"Query_end"},"\t",$match{"Subject_start"},"\t",$match{"Subject_end"},"\n"; + } + } + + + #Calcul des nouvelles metriques + + my $overlap_length = 0; + my $min_query = 0; + my $max_query = 0; + my $min_subject = 0; + my $max_subject = 0; + my $Subject_coverage; + my $Identity; + my $nb_match = 0; + + + my $nb_covered_subject=0; + my $nb_covered_query=0; + my %match=%{$match_table[0]}; + my $q_length = $match{"Query_length"}; + my $sub_length = $match{"Subject_length"}; + my $subject = $match{"Subject"}; + my $min_query = $match{"Query_start"}; + my $max_query = $match{"Query_end"}; + my $min_subject = $match{"Subject_start"}; + my $max_subject = $match{"Subject_end"}; + my $orientation = $match{"Orientation"}; + my $Query = $match{"Query"}; + my $Subject_Id = $match{"Subject_id"}; + my $Subject = $match{"Subject"}; + + + for (my $i=0;$i<=$#match_table;$i++){ + my %match1=%{$match_table[$i]}; + $nb_covered_subject += $match1{"Subject_end"} - $match1{"Subject_start"} +1; + $nb_covered_query += $match1{"Query_end"} - $match1{"Query_start"} +1; + + if ($match1{"Query_start"}<$min_query){ + $min_query = $match1{"Query_start"}; + } + if ($match1{"Query_end"}>$max_query){ + $max_query = $match1{"Query_end"}; + } + if ($match1{"Subject_start"}<$min_subject){ + $min_subject = $match1{"Subject_start"}; + } + if ($match1{"Subject_end"}>$max_subject){ + $max_subject = $match1{"Subject_end"}; + } + + $nb_match += $match1{"Similarity"}; + # print "TEST : ",$match1{"Similarity"},"\t",$nb_match,"\n"; + + for (my $j=$i+1;$j<=$#match_table;$j++){ + my $overlap_query=0; + my $overlap_subject=0; + my %match2=%{$match_table[$j]}; + + if (($match1{"Subject_start"}<=$match2{"Subject_start"})&&($match1{"Subject_end"}>=$match2{"Subject_start"})&&($match1{"Subject_end"}<=$match2{"Subject_end"})) + { + $overlap_subject = $match1{"Subject_end"} - $match2{"Subject_start"}+1; + # print "$i : $j : 3 : $overlap_subject\n"; + } + elsif (($match2{"Subject_start"}<=$match1{"Subject_start"})&&($match2{"Subject_end"}>=$match1{"Subject_start"})&&($match2{"Subject_end"}<=$match1{"Subject_end"})) + { + $overlap_subject = $match2{"Subject_end"} - $match1{"Subject_start"}+1; + # print "$i : $j : 4 : $overlap_subject\n"; + } + + if (($match1{"Query_start"}<=$match2{"Query_start"})&&($match1{"Query_end"}>=$match2{"Query_start"})&&($match1{"Query_end"}<=$match2{"Query_end"})) + { + $overlap_query = $match1{"Query_end"} - $match2{"Query_start"} +1; + # print "$i : $j : 5 : $overlap_query\n"; + } + elsif (($match2{"Query_start"}<=$match1{"Query_start"})&&($match2{"Query_end"}>=$match1{"Query_start"})&&($match2{"Query_end"}<=$match1{"Query_end"})) + { + $overlap_query = $match2{"Query_end"} - $match1{"Query_start"}+1; + # print "$i : $j : 6 : $overlap_query\n"; + } + + if ($overlap_query > $overlap_subject){ + $overlap_length += $overlap_query; + } + else { + $overlap_length += $overlap_subject; + } + + $nb_covered_subject -= $overlap_subject; + $nb_covered_query-= $overlap_query; + + } + + } + ### Cas des tblastx ou le nb match est en aa, et les nb_covered en base + if ($nb_match/$nb_covered_subject<0.34){ + $nb_match = $nb_match *3; + } + #### + + $Identity = sprintf("%.2f",($nb_match-$overlap_length)*100/$nb_covered_subject); + + $Subject_coverage = sprintf("%.2f",$nb_covered_subject*100/$sub_length); + $Query_coverage = sprintf("%.2f",$nb_covered_query*100/$q_length); + + print "Final\n"; + print $Query,"\t",$Subject_Id,"\t",$orientation,"\t",$min_query,"\t",$max_query,"\t",$min_subject,"\t",$max_subject,"\t",$sub_length,"\t"; + print "NB:",$nb_match,"\t","O:",$overlap_length,"\t","CQ:",$nb_covered_query,"\t","CS:",$nb_covered_subject,"\t",$Query_coverage,"\t",$Subject_coverage,"\t",$Identity,"\n"; + + if ($subject=~/^(.*?)\s*$/){ + $subject = $1; + } + + my %match_joined; + $match_joined{"Query"}=$Query; + $match_joined{"Query_start"}=$min_query; + $match_joined{"Query_end"}=$max_query; + $match_joined{"Query_length"}=$q_length; + $match_joined{"QCoverage"} = $Query_coverage; + $match_joined{"Subject_id"}=$Subject_Id; + $match_joined{"Subject"}=$subject; + $match_joined{"Subject_start"}=$min_subject; + $match_joined{"Subject_end"}=$max_subject; + $match_joined{"Subject_length"}=$sub_length; + $match_joined{"SCoverage"} = $Subject_coverage; + $match_joined{"Similarity"}=$Identity; + $match_joined{"Nbmatch"}=$nb_match-$overlap_length; + $match_joined{"Display"}="$Query\t$Subject_Id\t$orientation\t$Query_coverage%\t$Subject_coverage%\t$Identity%\t$min_query\t$max_query\t$min_subject\t$max_subject\t$q_length\t$sub_length\t$subject"; + + my $chr; + my $start; + my $end; + + if ($match_joined{"Query"}=~/(.*?)\:(\d+)[\.]+(\d+)/){ + $chr =$1; + $start = $2; + $end = $3; + + } + else { + print "Error Parsing Query : ",$match_joined{"Query"},"\n"; + exit(0); + } + + my $subid = $match_joined{"Subject_id"}; + my $nb = $nb_match-$overlap_length; + + my $key = "$chr#$start#$end#$nb#$subid"; + $all_match_joined{$key} = \%match_joined; + + + # my %match_joined; + # my $nb_covered=0; + # my $length=0; + # for (my $i=0;$i<=$#match_table;$i++){ + # my %match=%{$match_table[$i]}; + # $nb_covered+=$match{"Similarity"}; + # $length = $match{"Subject_length"} + # } + # # if ($match{"Subject_length"} == 0){ + # # print $key,"\n",$match{"Ligne"},"\n",$match{"Subject"},"\n"; + # # exit(0); + # # } + # my $similarity = sprintf("%.2f",$nb_covered / $length); + + # print "TEST : ",$key,"\t",$similarity,"\t",$nb_covered,"\t",$length,"\n"; + + + # if ($similarity > 1){ + # for (my $i=0;$i<=$#match_table;$i++){ + # my %match=%{$match_table[$i]}; + # print "----- : ",$match{"Ligne"},"\n"; + # exit(0); + # } + # } + # $match_joined{"Query"}=$match{"Query"}; + # $match_joined{"Subject"}=$match{"Subject"}; + # $match_joined{"Subject_id"}=$match{"Subject_id"}; + # $match_joined{"Similarity"}=$similarity; + # $match_joined{"Query_length"}=$match{"Query_length"}; + # $match_joined{"Subject_length"}=$match{"Subject_length"}; + # push(@all_match_joined,\%match_joined); + +} + +close (IB); + +my %all_match_joined_best; +foreach my $key (sort sortkey keys %all_match_joined){ + my %match = %{$all_match_joined{$key}}; + print OJ $match{"Display"},"\n"; + my $shortkey = $match{"Query"}; + if ($all_match_joined_best{$shortkey}){ + } + else { + $all_match_joined_best{$shortkey} = \%match; + print OB $match{"Display"},"\n"; + } + +} + +# for (my $i=0;$i<=$#all_match_joined;$i++){ + # my $match_joined = %{$all_match_joined[$i]}; + # print $match_joined{"Query"},"\t",$match_joined{"Subject"},"\t",$match_joined{"Subject_id"},"\t",$match_joined{"Similarity"},"\t",$match_joined{"Query_length"},"\t",$match_joined{"Subject_length"},"\n"; +# } + + +sub mysort{ + my %matcha=%{$a}; + my %matchb=%{$b}; + + #print "TEST : ",$matcha{"Query_start"}, " / ", $matchb{"Query_start"},"\n"; + + $matcha{"Query_start"} <=> $matchb{"Query_start"} + || + $matcha{"Query_end"} <=> $matchb{"Query_end"} + +} + +sub sortkey { + my @fieldsa = split (/\#/,$a); + my @fieldsb = split (/\#/,$b); + + #print "$a\n$b\n"; + #print $fieldsa[0]," cmp ",$fieldsb[0],"\n"; + #exit(0); + + $fieldsa[0] cmp $fieldsb[0] + || + $fieldsa[1] <=> $fieldsb[1] + || + $fieldsb[2] <=> $fieldsa[2] + || + $fieldsb[3] <=> $fieldsa[3] +} diff -r 000000000000 -r 448b10ffb095 genephys/parseblasttab.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genephys/parseblasttab.xml Tue Aug 12 05:49:33 2014 -0400 @@ -0,0 +1,17 @@ + +Parse Blast result (Tabular) to merge feature + + parseblasttab.pl $input_blast $output_merge $output_best + + + + + + + + + + + + +