# HG changeset patch # User bzeitouni # Date 1358853622 18000 # Node ID 091714bd75a041f0e185f44fc33dbca57b8069dd # Parent c284618dd8dad9bd647c7884099e2bb9132dd5ee new release r0.8b diff -r c284618dd8da -r 091714bd75a0 SVDetect_r0.8b_galaxy/external/karyotype.human_hg19.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SVDetect_r0.8b_galaxy/external/karyotype.human_hg19.txt Tue Jan 22 06:20:22 2013 -0500 @@ -0,0 +1,888 @@ +# Human chromsome sizes and cytogenetic band position. +# Data based on hg19 assembly. +chr - hs1 1 0 249250621 green +chr - hs2 2 0 243199373 green +chr - hs3 3 0 198022430 green +chr - hs4 4 0 191154276 green +chr - hs5 5 0 180915260 green +chr - hs6 6 0 171115067 green +chr - hs7 7 0 159138663 green +chr - hs8 8 0 146364022 green +chr - hs9 9 0 141213431 green +chr - hs10 10 0 135534747 green +chr - hs11 11 0 135006516 green +chr - hs12 12 0 133851895 green +chr - hs13 13 0 115169878 green +chr - hs14 14 0 107349540 green +chr - hs15 15 0 102531392 green +chr - hs16 16 0 90354753 green +chr - hs17 17 0 81195210 green +chr - hs18 18 0 78077248 green +chr - hs19 19 0 59128983 green +chr - hs20 20 0 63025520 green +chr - hs21 21 0 48129895 green +chr - hs22 22 0 51304566 green +chr - hsX X 0 155270560 green +chr - hsY Y 0 59373566 green +band hs1 p36.33 p36.33 0 2300000 gneg +band hs1 p36.32 p36.32 2300000 5400000 gpos25 +band hs1 p36.31 p36.31 5400000 7200000 gneg +band hs1 p36.23 p36.23 7200000 9200000 gpos25 +band hs1 p36.22 p36.22 9200000 12700000 gneg +band hs1 p36.21 p36.21 12700000 16200000 gpos50 +band hs1 p36.13 p36.13 16200000 20400000 gneg +band hs1 p36.12 p36.12 20400000 23900000 gpos25 +band hs1 p36.11 p36.11 23900000 28000000 gneg +band hs1 p35.3 p35.3 28000000 30200000 gpos25 +band hs1 p35.2 p35.2 30200000 32400000 gneg +band hs1 p35.1 p35.1 32400000 34600000 gpos25 +band hs1 p34.3 p34.3 34600000 40100000 gneg +band hs1 p34.2 p34.2 40100000 44100000 gpos25 +band hs1 p34.1 p34.1 44100000 46800000 gneg +band hs1 p33 p33 46800000 50700000 gpos75 +band hs1 p32.3 p32.3 50700000 56100000 gneg +band hs1 p32.2 p32.2 56100000 59000000 gpos50 +band hs1 p32.1 p32.1 59000000 61300000 gneg +band hs1 p31.3 p31.3 61300000 68900000 gpos50 +band hs1 p31.2 p31.2 68900000 69700000 gneg +band hs1 p31.1 p31.1 69700000 84900000 gpos100 +band hs1 p22.3 p22.3 84900000 88400000 gneg +band hs1 p22.2 p22.2 88400000 92000000 gpos75 +band hs1 p22.1 p22.1 92000000 94700000 gneg +band hs1 p21.3 p21.3 94700000 99700000 gpos75 +band hs1 p21.2 p21.2 99700000 102200000 gneg +band hs1 p21.1 p21.1 102200000 107200000 gpos100 +band hs1 p13.3 p13.3 107200000 111800000 gneg +band hs1 p13.2 p13.2 111800000 116100000 gpos50 +band hs1 p13.1 p13.1 116100000 117800000 gneg +band hs1 p12 p12 117800000 120600000 gpos50 +band hs1 p11.2 p11.2 120600000 121500000 gneg +band hs1 p11.1 p11.1 121500000 125000000 acen +band hs1 q11 q11 125000000 128900000 acen +band hs1 q12 q12 128900000 142600000 gvar +band hs1 q21.1 q21.1 142600000 147000000 gneg +band hs1 q21.2 q21.2 147000000 150300000 gpos50 +band hs1 q21.3 q21.3 150300000 155000000 gneg +band hs1 q22 q22 155000000 156500000 gpos50 +band hs1 q23.1 q23.1 156500000 159100000 gneg +band hs1 q23.2 q23.2 159100000 160500000 gpos50 +band hs1 q23.3 q23.3 160500000 165500000 gneg +band hs1 q24.1 q24.1 165500000 167200000 gpos50 +band hs1 q24.2 q24.2 167200000 170900000 gneg +band hs1 q24.3 q24.3 170900000 172900000 gpos75 +band hs1 q25.1 q25.1 172900000 176000000 gneg +band hs1 q25.2 q25.2 176000000 180300000 gpos50 +band hs1 q25.3 q25.3 180300000 185800000 gneg +band hs1 q31.1 q31.1 185800000 190800000 gpos100 +band hs1 q31.2 q31.2 190800000 193800000 gneg +band hs1 q31.3 q31.3 193800000 198700000 gpos100 +band hs1 q32.1 q32.1 198700000 207200000 gneg +band hs1 q32.2 q32.2 207200000 211500000 gpos25 +band hs1 q32.3 q32.3 211500000 214500000 gneg +band hs1 q41 q41 214500000 224100000 gpos100 +band hs1 q42.11 q42.11 224100000 224600000 gneg +band hs1 q42.12 q42.12 224600000 227000000 gpos25 +band hs1 q42.13 q42.13 227000000 230700000 gneg +band hs1 q42.2 q42.2 230700000 234700000 gpos50 +band hs1 q42.3 q42.3 234700000 236600000 gneg +band hs1 q43 q43 236600000 243700000 gpos75 +band hs1 q44 q44 243700000 249250621 gneg +band hs10 p15.3 p15.3 0 3000000 gneg +band hs10 p15.2 p15.2 3000000 3800000 gpos25 +band hs10 p15.1 p15.1 3800000 6600000 gneg +band hs10 p14 p14 6600000 12200000 gpos75 +band hs10 p13 p13 12200000 17300000 gneg +band hs10 p12.33 p12.33 17300000 18600000 gpos75 +band hs10 p12.32 p12.32 18600000 18700000 gneg +band hs10 p12.31 p12.31 18700000 22600000 gpos75 +band hs10 p12.2 p12.2 22600000 24600000 gneg +band hs10 p12.1 p12.1 24600000 29600000 gpos50 +band hs10 p11.23 p11.23 29600000 31300000 gneg +band hs10 p11.22 p11.22 31300000 34400000 gpos25 +band hs10 p11.21 p11.21 34400000 38000000 gneg +band hs10 p11.1 p11.1 38000000 40200000 acen +band hs10 q11.1 q11.1 40200000 42300000 acen +band hs10 q11.21 q11.21 42300000 46100000 gneg +band hs10 q11.22 q11.22 46100000 49900000 gpos25 +band hs10 q11.23 q11.23 49900000 52900000 gneg +band hs10 q21.1 q21.1 52900000 61200000 gpos100 +band hs10 q21.2 q21.2 61200000 64500000 gneg +band hs10 q21.3 q21.3 64500000 70600000 gpos100 +band hs10 q22.1 q22.1 70600000 74900000 gneg +band hs10 q22.2 q22.2 74900000 77700000 gpos50 +band hs10 q22.3 q22.3 77700000 82000000 gneg +band hs10 q23.1 q23.1 82000000 87900000 gpos100 +band hs10 q23.2 q23.2 87900000 89500000 gneg +band hs10 q23.31 q23.31 89500000 92900000 gpos75 +band hs10 q23.32 q23.32 92900000 94100000 gneg +band hs10 q23.33 q23.33 94100000 97000000 gpos50 +band hs10 q24.1 q24.1 97000000 99300000 gneg +band hs10 q24.2 q24.2 99300000 101900000 gpos50 +band hs10 q24.31 q24.31 101900000 103000000 gneg +band hs10 q24.32 q24.32 103000000 104900000 gpos25 +band hs10 q24.33 q24.33 104900000 105800000 gneg +band hs10 q25.1 q25.1 105800000 111900000 gpos100 +band hs10 q25.2 q25.2 111900000 114900000 gneg +band hs10 q25.3 q25.3 114900000 119100000 gpos75 +band hs10 q26.11 q26.11 119100000 121700000 gneg +band hs10 q26.12 q26.12 121700000 123100000 gpos50 +band hs10 q26.13 q26.13 123100000 127500000 gneg +band hs10 q26.2 q26.2 127500000 130600000 gpos50 +band hs10 q26.3 q26.3 130600000 135534747 gneg +band hs11 p15.5 p15.5 0 2800000 gneg +band hs11 p15.4 p15.4 2800000 10700000 gpos50 +band hs11 p15.3 p15.3 10700000 12700000 gneg +band hs11 p15.2 p15.2 12700000 16200000 gpos50 +band hs11 p15.1 p15.1 16200000 21700000 gneg +band hs11 p14.3 p14.3 21700000 26100000 gpos100 +band hs11 p14.2 p14.2 26100000 27200000 gneg +band hs11 p14.1 p14.1 27200000 31000000 gpos75 +band hs11 p13 p13 31000000 36400000 gneg +band hs11 p12 p12 36400000 43500000 gpos100 +band hs11 p11.2 p11.2 43500000 48800000 gneg +band hs11 p11.12 p11.12 48800000 51600000 gpos75 +band hs11 p11.11 p11.11 51600000 53700000 acen +band hs11 q11 q11 53700000 55700000 acen +band hs11 q12.1 q12.1 55700000 59900000 gpos75 +band hs11 q12.2 q12.2 59900000 61700000 gneg +band hs11 q12.3 q12.3 61700000 63400000 gpos25 +band hs11 q13.1 q13.1 63400000 65900000 gneg +band hs11 q13.2 q13.2 65900000 68400000 gpos25 +band hs11 q13.3 q13.3 68400000 70400000 gneg +band hs11 q13.4 q13.4 70400000 75200000 gpos50 +band hs11 q13.5 q13.5 75200000 77100000 gneg +band hs11 q14.1 q14.1 77100000 85600000 gpos100 +band hs11 q14.2 q14.2 85600000 88300000 gneg +band hs11 q14.3 q14.3 88300000 92800000 gpos100 +band hs11 q21 q21 92800000 97200000 gneg +band hs11 q22.1 q22.1 97200000 102100000 gpos100 +band hs11 q22.2 q22.2 102100000 102900000 gneg +band hs11 q22.3 q22.3 102900000 110400000 gpos100 +band hs11 q23.1 q23.1 110400000 112500000 gneg +band hs11 q23.2 q23.2 112500000 114500000 gpos50 +band hs11 q23.3 q23.3 114500000 121200000 gneg +band hs11 q24.1 q24.1 121200000 123900000 gpos50 +band hs11 q24.2 q24.2 123900000 127800000 gneg +band hs11 q24.3 q24.3 127800000 130800000 gpos50 +band hs11 q25 q25 130800000 135006516 gneg +band hs12 p13.33 p13.33 0 3300000 gneg +band hs12 p13.32 p13.32 3300000 5400000 gpos25 +band hs12 p13.31 p13.31 5400000 10100000 gneg +band hs12 p13.2 p13.2 10100000 12800000 gpos75 +band hs12 p13.1 p13.1 12800000 14800000 gneg +band hs12 p12.3 p12.3 14800000 20000000 gpos100 +band hs12 p12.2 p12.2 20000000 21300000 gneg +band hs12 p12.1 p12.1 21300000 26500000 gpos100 +band hs12 p11.23 p11.23 26500000 27800000 gneg +band hs12 p11.22 p11.22 27800000 30700000 gpos50 +band hs12 p11.21 p11.21 30700000 33300000 gneg +band hs12 p11.1 p11.1 33300000 35800000 acen +band hs12 q11 q11 35800000 38200000 acen +band hs12 q12 q12 38200000 46400000 gpos100 +band hs12 q13.11 q13.11 46400000 49100000 gneg +band hs12 q13.12 q13.12 49100000 51500000 gpos25 +band hs12 q13.13 q13.13 51500000 54900000 gneg +band hs12 q13.2 q13.2 54900000 56600000 gpos25 +band hs12 q13.3 q13.3 56600000 58100000 gneg +band hs12 q14.1 q14.1 58100000 63100000 gpos75 +band hs12 q14.2 q14.2 63100000 65100000 gneg +band hs12 q14.3 q14.3 65100000 67700000 gpos50 +band hs12 q15 q15 67700000 71500000 gneg +band hs12 q21.1 q21.1 71500000 75700000 gpos75 +band hs12 q21.2 q21.2 75700000 80300000 gneg +band hs12 q21.31 q21.31 80300000 86700000 gpos100 +band hs12 q21.32 q21.32 86700000 89000000 gneg +band hs12 q21.33 q21.33 89000000 92600000 gpos100 +band hs12 q22 q22 92600000 96200000 gneg +band hs12 q23.1 q23.1 96200000 101600000 gpos75 +band hs12 q23.2 q23.2 101600000 103800000 gneg +band hs12 q23.3 q23.3 103800000 109000000 gpos50 +band hs12 q24.11 q24.11 109000000 111700000 gneg +band hs12 q24.12 q24.12 111700000 112300000 gpos25 +band hs12 q24.13 q24.13 112300000 114300000 gneg +band hs12 q24.21 q24.21 114300000 116800000 gpos50 +band hs12 q24.22 q24.22 116800000 118100000 gneg +band hs12 q24.23 q24.23 118100000 120700000 gpos50 +band hs12 q24.31 q24.31 120700000 125900000 gneg +band hs12 q24.32 q24.32 125900000 129300000 gpos50 +band hs12 q24.33 q24.33 129300000 133851895 gneg +band hs13 p13 p13 0 4500000 gvar +band hs13 p12 p12 4500000 10000000 stalk +band hs13 p11.2 p11.2 10000000 16300000 gvar +band hs13 p11.1 p11.1 16300000 17900000 acen +band hs13 q11 q11 17900000 19500000 acen +band hs13 q12.11 q12.11 19500000 23300000 gneg +band hs13 q12.12 q12.12 23300000 25500000 gpos25 +band hs13 q12.13 q12.13 25500000 27800000 gneg +band hs13 q12.2 q12.2 27800000 28900000 gpos25 +band hs13 q12.3 q12.3 28900000 32200000 gneg +band hs13 q13.1 q13.1 32200000 34000000 gpos50 +band hs13 q13.2 q13.2 34000000 35500000 gneg +band hs13 q13.3 q13.3 35500000 40100000 gpos75 +band hs13 q14.11 q14.11 40100000 45200000 gneg +band hs13 q14.12 q14.12 45200000 45800000 gpos25 +band hs13 q14.13 q14.13 45800000 47300000 gneg +band hs13 q14.2 q14.2 47300000 50900000 gpos50 +band hs13 q14.3 q14.3 50900000 55300000 gneg +band hs13 q21.1 q21.1 55300000 59600000 gpos100 +band hs13 q21.2 q21.2 59600000 62300000 gneg +band hs13 q21.31 q21.31 62300000 65700000 gpos75 +band hs13 q21.32 q21.32 65700000 68600000 gneg +band hs13 q21.33 q21.33 68600000 73300000 gpos100 +band hs13 q22.1 q22.1 73300000 75400000 gneg +band hs13 q22.2 q22.2 75400000 77200000 gpos50 +band hs13 q22.3 q22.3 77200000 79000000 gneg +band hs13 q31.1 q31.1 79000000 87700000 gpos100 +band hs13 q31.2 q31.2 87700000 90000000 gneg +band hs13 q31.3 q31.3 90000000 95000000 gpos100 +band hs13 q32.1 q32.1 95000000 98200000 gneg +band hs13 q32.2 q32.2 98200000 99300000 gpos25 +band hs13 q32.3 q32.3 99300000 101700000 gneg +band hs13 q33.1 q33.1 101700000 104800000 gpos100 +band hs13 q33.2 q33.2 104800000 107000000 gneg +band hs13 q33.3 q33.3 107000000 110300000 gpos100 +band hs13 q34 q34 110300000 115169878 gneg +band hs14 p13 p13 0 3700000 gvar +band hs14 p12 p12 3700000 8100000 stalk +band hs14 p11.2 p11.2 8100000 16100000 gvar +band hs14 p11.1 p11.1 16100000 17600000 acen +band hs14 q11.1 q11.1 17600000 19100000 acen +band hs14 q11.2 q11.2 19100000 24600000 gneg +band hs14 q12 q12 24600000 33300000 gpos100 +band hs14 q13.1 q13.1 33300000 35300000 gneg +band hs14 q13.2 q13.2 35300000 36600000 gpos50 +band hs14 q13.3 q13.3 36600000 37800000 gneg +band hs14 q21.1 q21.1 37800000 43500000 gpos100 +band hs14 q21.2 q21.2 43500000 47200000 gneg +band hs14 q21.3 q21.3 47200000 50900000 gpos100 +band hs14 q22.1 q22.1 50900000 54100000 gneg +band hs14 q22.2 q22.2 54100000 55500000 gpos25 +band hs14 q22.3 q22.3 55500000 58100000 gneg +band hs14 q23.1 q23.1 58100000 62100000 gpos75 +band hs14 q23.2 q23.2 62100000 64800000 gneg +band hs14 q23.3 q23.3 64800000 67900000 gpos50 +band hs14 q24.1 q24.1 67900000 70200000 gneg +band hs14 q24.2 q24.2 70200000 73800000 gpos50 +band hs14 q24.3 q24.3 73800000 79300000 gneg +band hs14 q31.1 q31.1 79300000 83600000 gpos100 +band hs14 q31.2 q31.2 83600000 84900000 gneg +band hs14 q31.3 q31.3 84900000 89800000 gpos100 +band hs14 q32.11 q32.11 89800000 91900000 gneg +band hs14 q32.12 q32.12 91900000 94700000 gpos25 +band hs14 q32.13 q32.13 94700000 96300000 gneg +band hs14 q32.2 q32.2 96300000 101400000 gpos50 +band hs14 q32.31 q32.31 101400000 103200000 gneg +band hs14 q32.32 q32.32 103200000 104000000 gpos50 +band hs14 q32.33 q32.33 104000000 107349540 gneg +band hs15 p13 p13 0 3900000 gvar +band hs15 p12 p12 3900000 8700000 stalk +band hs15 p11.2 p11.2 8700000 15800000 gvar +band hs15 p11.1 p11.1 15800000 19000000 acen +band hs15 q11.1 q11.1 19000000 20700000 acen +band hs15 q11.2 q11.2 20700000 25700000 gneg +band hs15 q12 q12 25700000 28100000 gpos50 +band hs15 q13.1 q13.1 28100000 30300000 gneg +band hs15 q13.2 q13.2 30300000 31200000 gpos50 +band hs15 q13.3 q13.3 31200000 33600000 gneg +band hs15 q14 q14 33600000 40100000 gpos75 +band hs15 q15.1 q15.1 40100000 42800000 gneg +band hs15 q15.2 q15.2 42800000 43600000 gpos25 +band hs15 q15.3 q15.3 43600000 44800000 gneg +band hs15 q21.1 q21.1 44800000 49500000 gpos75 +band hs15 q21.2 q21.2 49500000 52900000 gneg +band hs15 q21.3 q21.3 52900000 59100000 gpos75 +band hs15 q22.1 q22.1 59100000 59300000 gneg +band hs15 q22.2 q22.2 59300000 63700000 gpos25 +band hs15 q22.31 q22.31 63700000 67200000 gneg +band hs15 q22.32 q22.32 67200000 67300000 gpos25 +band hs15 q22.33 q22.33 67300000 67500000 gneg +band hs15 q23 q23 67500000 72700000 gpos25 +band hs15 q24.1 q24.1 72700000 75200000 gneg +band hs15 q24.2 q24.2 75200000 76600000 gpos25 +band hs15 q24.3 q24.3 76600000 78300000 gneg +band hs15 q25.1 q25.1 78300000 81700000 gpos50 +band hs15 q25.2 q25.2 81700000 85200000 gneg +band hs15 q25.3 q25.3 85200000 89100000 gpos50 +band hs15 q26.1 q26.1 89100000 94300000 gneg +band hs15 q26.2 q26.2 94300000 98500000 gpos50 +band hs15 q26.3 q26.3 98500000 102531392 gneg +band hs16 p13.3 p13.3 0 7900000 gneg +band hs16 p13.2 p13.2 7900000 10500000 gpos50 +band hs16 p13.13 p13.13 10500000 12600000 gneg +band hs16 p13.12 p13.12 12600000 14800000 gpos50 +band hs16 p13.11 p13.11 14800000 16800000 gneg +band hs16 p12.3 p12.3 16800000 21200000 gpos50 +band hs16 p12.2 p12.2 21200000 24200000 gneg +band hs16 p12.1 p12.1 24200000 28100000 gpos50 +band hs16 p11.2 p11.2 28100000 34600000 gneg +band hs16 p11.1 p11.1 34600000 36600000 acen +band hs16 q11.1 q11.1 36600000 38600000 acen +band hs16 q11.2 q11.2 38600000 47000000 gvar +band hs16 q12.1 q12.1 47000000 52600000 gneg +band hs16 q12.2 q12.2 52600000 56700000 gpos50 +band hs16 q13 q13 56700000 57400000 gneg +band hs16 q21 q21 57400000 66700000 gpos100 +band hs16 q22.1 q22.1 66700000 70800000 gneg +band hs16 q22.2 q22.2 70800000 72900000 gpos50 +band hs16 q22.3 q22.3 72900000 74100000 gneg +band hs16 q23.1 q23.1 74100000 79200000 gpos75 +band hs16 q23.2 q23.2 79200000 81700000 gneg +band hs16 q23.3 q23.3 81700000 84200000 gpos50 +band hs16 q24.1 q24.1 84200000 87100000 gneg +band hs16 q24.2 q24.2 87100000 88700000 gpos25 +band hs16 q24.3 q24.3 88700000 90354753 gneg +band hs17 p13.3 p13.3 0 3300000 gneg +band hs17 p13.2 p13.2 3300000 6500000 gpos50 +band hs17 p13.1 p13.1 6500000 10700000 gneg +band hs17 p12 p12 10700000 16000000 gpos75 +band hs17 p11.2 p11.2 16000000 22200000 gneg +band hs17 p11.1 p11.1 22200000 24000000 acen +band hs17 q11.1 q11.1 24000000 25800000 acen +band hs17 q11.2 q11.2 25800000 31800000 gneg +band hs17 q12 q12 31800000 38100000 gpos50 +band hs17 q21.1 q21.1 38100000 38400000 gneg +band hs17 q21.2 q21.2 38400000 40900000 gpos25 +band hs17 q21.31 q21.31 40900000 44900000 gneg +band hs17 q21.32 q21.32 44900000 47400000 gpos25 +band hs17 q21.33 q21.33 47400000 50200000 gneg +band hs17 q22 q22 50200000 57600000 gpos75 +band hs17 q23.1 q23.1 57600000 58300000 gneg +band hs17 q23.2 q23.2 58300000 61100000 gpos75 +band hs17 q23.3 q23.3 61100000 62600000 gneg +band hs17 q24.1 q24.1 62600000 64200000 gpos50 +band hs17 q24.2 q24.2 64200000 67100000 gneg +band hs17 q24.3 q24.3 67100000 70900000 gpos75 +band hs17 q25.1 q25.1 70900000 74800000 gneg +band hs17 q25.2 q25.2 74800000 75300000 gpos25 +band hs17 q25.3 q25.3 75300000 81195210 gneg +band hs18 p11.32 p11.32 0 2900000 gneg +band hs18 p11.31 p11.31 2900000 7100000 gpos50 +band hs18 p11.23 p11.23 7100000 8500000 gneg +band hs18 p11.22 p11.22 8500000 10900000 gpos25 +band hs18 p11.21 p11.21 10900000 15400000 gneg +band hs18 p11.1 p11.1 15400000 17200000 acen +band hs18 q11.1 q11.1 17200000 19000000 acen +band hs18 q11.2 q11.2 19000000 25000000 gneg +band hs18 q12.1 q12.1 25000000 32700000 gpos100 +band hs18 q12.2 q12.2 32700000 37200000 gneg +band hs18 q12.3 q12.3 37200000 43500000 gpos75 +band hs18 q21.1 q21.1 43500000 48200000 gneg +band hs18 q21.2 q21.2 48200000 53800000 gpos75 +band hs18 q21.31 q21.31 53800000 56200000 gneg +band hs18 q21.32 q21.32 56200000 59000000 gpos50 +band hs18 q21.33 q21.33 59000000 61600000 gneg +band hs18 q22.1 q22.1 61600000 66800000 gpos100 +band hs18 q22.2 q22.2 66800000 68700000 gneg +band hs18 q22.3 q22.3 68700000 73100000 gpos25 +band hs18 q23 q23 73100000 78077248 gneg +band hs19 p13.3 p13.3 0 6900000 gneg +band hs19 p13.2 p13.2 6900000 13900000 gpos25 +band hs19 p13.13 p13.13 13900000 14000000 gneg +band hs19 p13.12 p13.12 14000000 16300000 gpos25 +band hs19 p13.11 p13.11 16300000 20000000 gneg +band hs19 p12 p12 20000000 24400000 gvar +band hs19 p11 p11 24400000 26500000 acen +band hs19 q11 q11 26500000 28600000 acen +band hs19 q12 q12 28600000 32400000 gvar +band hs19 q13.11 q13.11 32400000 35500000 gneg +band hs19 q13.12 q13.12 35500000 38300000 gpos25 +band hs19 q13.13 q13.13 38300000 38700000 gneg +band hs19 q13.2 q13.2 38700000 43400000 gpos25 +band hs19 q13.31 q13.31 43400000 45200000 gneg +band hs19 q13.32 q13.32 45200000 48000000 gpos25 +band hs19 q13.33 q13.33 48000000 51400000 gneg +band hs19 q13.41 q13.41 51400000 53600000 gpos25 +band hs19 q13.42 q13.42 53600000 56300000 gneg +band hs19 q13.43 q13.43 56300000 59128983 gpos25 +band hs2 p25.3 p25.3 0 4400000 gneg +band hs2 p25.2 p25.2 4400000 7100000 gpos50 +band hs2 p25.1 p25.1 7100000 12200000 gneg +band hs2 p24.3 p24.3 12200000 16700000 gpos75 +band hs2 p24.2 p24.2 16700000 19200000 gneg +band hs2 p24.1 p24.1 19200000 24000000 gpos75 +band hs2 p23.3 p23.3 24000000 27900000 gneg +band hs2 p23.2 p23.2 27900000 30000000 gpos25 +band hs2 p23.1 p23.1 30000000 32100000 gneg +band hs2 p22.3 p22.3 32100000 36600000 gpos75 +band hs2 p22.2 p22.2 36600000 38600000 gneg +band hs2 p22.1 p22.1 38600000 41800000 gpos50 +band hs2 p21 p21 41800000 47800000 gneg +band hs2 p16.3 p16.3 47800000 52900000 gpos100 +band hs2 p16.2 p16.2 52900000 55000000 gneg +band hs2 p16.1 p16.1 55000000 61300000 gpos100 +band hs2 p15 p15 61300000 64100000 gneg +band hs2 p14 p14 64100000 68600000 gpos50 +band hs2 p13.3 p13.3 68600000 71500000 gneg +band hs2 p13.2 p13.2 71500000 73500000 gpos50 +band hs2 p13.1 p13.1 73500000 75000000 gneg +band hs2 p12 p12 75000000 83300000 gpos100 +band hs2 p11.2 p11.2 83300000 90500000 gneg +band hs2 p11.1 p11.1 90500000 93300000 acen +band hs2 q11.1 q11.1 93300000 96800000 acen +band hs2 q11.2 q11.2 96800000 102700000 gneg +band hs2 q12.1 q12.1 102700000 106000000 gpos50 +band hs2 q12.2 q12.2 106000000 107500000 gneg +band hs2 q12.3 q12.3 107500000 110200000 gpos25 +band hs2 q13 q13 110200000 114400000 gneg +band hs2 q14.1 q14.1 114400000 118800000 gpos50 +band hs2 q14.2 q14.2 118800000 122400000 gneg +band hs2 q14.3 q14.3 122400000 129900000 gpos50 +band hs2 q21.1 q21.1 129900000 132500000 gneg +band hs2 q21.2 q21.2 132500000 135100000 gpos25 +band hs2 q21.3 q21.3 135100000 136800000 gneg +band hs2 q22.1 q22.1 136800000 142200000 gpos100 +band hs2 q22.2 q22.2 142200000 144100000 gneg +band hs2 q22.3 q22.3 144100000 148700000 gpos100 +band hs2 q23.1 q23.1 148700000 149900000 gneg +band hs2 q23.2 q23.2 149900000 150500000 gpos25 +band hs2 q23.3 q23.3 150500000 154900000 gneg +band hs2 q24.1 q24.1 154900000 159800000 gpos75 +band hs2 q24.2 q24.2 159800000 163700000 gneg +band hs2 q24.3 q24.3 163700000 169700000 gpos75 +band hs2 q31.1 q31.1 169700000 178000000 gneg +band hs2 q31.2 q31.2 178000000 180600000 gpos50 +band hs2 q31.3 q31.3 180600000 183000000 gneg +band hs2 q32.1 q32.1 183000000 189400000 gpos75 +band hs2 q32.2 q32.2 189400000 191900000 gneg +band hs2 q32.3 q32.3 191900000 197400000 gpos75 +band hs2 q33.1 q33.1 197400000 203300000 gneg +band hs2 q33.2 q33.2 203300000 204900000 gpos50 +band hs2 q33.3 q33.3 204900000 209000000 gneg +band hs2 q34 q34 209000000 215300000 gpos100 +band hs2 q35 q35 215300000 221500000 gneg +band hs2 q36.1 q36.1 221500000 225200000 gpos75 +band hs2 q36.2 q36.2 225200000 226100000 gneg +band hs2 q36.3 q36.3 226100000 231000000 gpos100 +band hs2 q37.1 q37.1 231000000 235600000 gneg +band hs2 q37.2 q37.2 235600000 237300000 gpos50 +band hs2 q37.3 q37.3 237300000 243199373 gneg +band hs20 p13 p13 0 5100000 gneg +band hs20 p12.3 p12.3 5100000 9200000 gpos75 +band hs20 p12.2 p12.2 9200000 12100000 gneg +band hs20 p12.1 p12.1 12100000 17900000 gpos75 +band hs20 p11.23 p11.23 17900000 21300000 gneg +band hs20 p11.22 p11.22 21300000 22300000 gpos25 +band hs20 p11.21 p11.21 22300000 25600000 gneg +band hs20 p11.1 p11.1 25600000 27500000 acen +band hs20 q11.1 q11.1 27500000 29400000 acen +band hs20 q11.21 q11.21 29400000 32100000 gneg +band hs20 q11.22 q11.22 32100000 34400000 gpos25 +band hs20 q11.23 q11.23 34400000 37600000 gneg +band hs20 q12 q12 37600000 41700000 gpos75 +band hs20 q13.11 q13.11 41700000 42100000 gneg +band hs20 q13.12 q13.12 42100000 46400000 gpos25 +band hs20 q13.13 q13.13 46400000 49800000 gneg +band hs20 q13.2 q13.2 49800000 55000000 gpos75 +band hs20 q13.31 q13.31 55000000 56500000 gneg +band hs20 q13.32 q13.32 56500000 58400000 gpos50 +band hs20 q13.33 q13.33 58400000 63025520 gneg +band hs21 p13 p13 0 2800000 gvar +band hs21 p12 p12 2800000 6800000 stalk +band hs21 p11.2 p11.2 6800000 10900000 gvar +band hs21 p11.1 p11.1 10900000 13200000 acen +band hs21 q11.1 q11.1 13200000 14300000 acen +band hs21 q11.2 q11.2 14300000 16400000 gneg +band hs21 q21.1 q21.1 16400000 24000000 gpos100 +band hs21 q21.2 q21.2 24000000 26800000 gneg +band hs21 q21.3 q21.3 26800000 31500000 gpos75 +band hs21 q22.11 q22.11 31500000 35800000 gneg +band hs21 q22.12 q22.12 35800000 37800000 gpos50 +band hs21 q22.13 q22.13 37800000 39700000 gneg +band hs21 q22.2 q22.2 39700000 42600000 gpos50 +band hs21 q22.3 q22.3 42600000 48129895 gneg +band hs22 p13 p13 0 3800000 gvar +band hs22 p12 p12 3800000 8300000 stalk +band hs22 p11.2 p11.2 8300000 12200000 gvar +band hs22 p11.1 p11.1 12200000 14700000 acen +band hs22 q11.1 q11.1 14700000 17900000 acen +band hs22 q11.21 q11.21 17900000 22200000 gneg +band hs22 q11.22 q11.22 22200000 23500000 gpos25 +band hs22 q11.23 q11.23 23500000 25900000 gneg +band hs22 q12.1 q12.1 25900000 29600000 gpos50 +band hs22 q12.2 q12.2 29600000 32200000 gneg +band hs22 q12.3 q12.3 32200000 37600000 gpos50 +band hs22 q13.1 q13.1 37600000 41000000 gneg +band hs22 q13.2 q13.2 41000000 44200000 gpos50 +band hs22 q13.31 q13.31 44200000 48400000 gneg +band hs22 q13.32 q13.32 48400000 49400000 gpos50 +band hs22 q13.33 q13.33 49400000 51304566 gneg +band hs3 p26.3 p26.3 0 2800000 gpos50 +band hs3 p26.2 p26.2 2800000 4000000 gneg +band hs3 p26.1 p26.1 4000000 8700000 gpos50 +band hs3 p25.3 p25.3 8700000 11800000 gneg +band hs3 p25.2 p25.2 11800000 13300000 gpos25 +band hs3 p25.1 p25.1 13300000 16400000 gneg +band hs3 p24.3 p24.3 16400000 23900000 gpos100 +band hs3 p24.2 p24.2 23900000 26400000 gneg +band hs3 p24.1 p24.1 26400000 30900000 gpos75 +band hs3 p23 p23 30900000 32100000 gneg +band hs3 p22.3 p22.3 32100000 36500000 gpos50 +band hs3 p22.2 p22.2 36500000 39400000 gneg +band hs3 p22.1 p22.1 39400000 43700000 gpos75 +band hs3 p21.33 p21.33 43700000 44100000 gneg +band hs3 p21.32 p21.32 44100000 44200000 gpos50 +band hs3 p21.31 p21.31 44200000 50600000 gneg +band hs3 p21.2 p21.2 50600000 52300000 gpos25 +band hs3 p21.1 p21.1 52300000 54400000 gneg +band hs3 p14.3 p14.3 54400000 58600000 gpos50 +band hs3 p14.2 p14.2 58600000 63700000 gneg +band hs3 p14.1 p14.1 63700000 69800000 gpos50 +band hs3 p13 p13 69800000 74200000 gneg +band hs3 p12.3 p12.3 74200000 79800000 gpos75 +band hs3 p12.2 p12.2 79800000 83500000 gneg +band hs3 p12.1 p12.1 83500000 87200000 gpos75 +band hs3 p11.2 p11.2 87200000 87900000 gneg +band hs3 p11.1 p11.1 87900000 91000000 acen +band hs3 q11.1 q11.1 91000000 93900000 acen +band hs3 q11.2 q11.2 93900000 98300000 gvar +band hs3 q12.1 q12.1 98300000 100000000 gneg +band hs3 q12.2 q12.2 100000000 100900000 gpos25 +band hs3 q12.3 q12.3 100900000 102800000 gneg +band hs3 q13.11 q13.11 102800000 106200000 gpos75 +band hs3 q13.12 q13.12 106200000 107900000 gneg +band hs3 q13.13 q13.13 107900000 111300000 gpos50 +band hs3 q13.2 q13.2 111300000 113500000 gneg +band hs3 q13.31 q13.31 113500000 117300000 gpos75 +band hs3 q13.32 q13.32 117300000 119000000 gneg +band hs3 q13.33 q13.33 119000000 121900000 gpos75 +band hs3 q21.1 q21.1 121900000 123800000 gneg +band hs3 q21.2 q21.2 123800000 125800000 gpos25 +band hs3 q21.3 q21.3 125800000 129200000 gneg +band hs3 q22.1 q22.1 129200000 133700000 gpos25 +band hs3 q22.2 q22.2 133700000 135700000 gneg +band hs3 q22.3 q22.3 135700000 138700000 gpos25 +band hs3 q23 q23 138700000 142800000 gneg +band hs3 q24 q24 142800000 148900000 gpos100 +band hs3 q25.1 q25.1 148900000 152100000 gneg +band hs3 q25.2 q25.2 152100000 155000000 gpos50 +band hs3 q25.31 q25.31 155000000 157000000 gneg +band hs3 q25.32 q25.32 157000000 159000000 gpos50 +band hs3 q25.33 q25.33 159000000 160700000 gneg +band hs3 q26.1 q26.1 160700000 167600000 gpos100 +band hs3 q26.2 q26.2 167600000 170900000 gneg +band hs3 q26.31 q26.31 170900000 175700000 gpos75 +band hs3 q26.32 q26.32 175700000 179000000 gneg +band hs3 q26.33 q26.33 179000000 182700000 gpos75 +band hs3 q27.1 q27.1 182700000 184500000 gneg +band hs3 q27.2 q27.2 184500000 186000000 gpos25 +band hs3 q27.3 q27.3 186000000 187900000 gneg +band hs3 q28 q28 187900000 192300000 gpos75 +band hs3 q29 q29 192300000 198022430 gneg +band hs4 p16.3 p16.3 0 4500000 gneg +band hs4 p16.2 p16.2 4500000 6000000 gpos25 +band hs4 p16.1 p16.1 6000000 11300000 gneg +band hs4 p15.33 p15.33 11300000 15200000 gpos50 +band hs4 p15.32 p15.32 15200000 17800000 gneg +band hs4 p15.31 p15.31 17800000 21300000 gpos75 +band hs4 p15.2 p15.2 21300000 27700000 gneg +band hs4 p15.1 p15.1 27700000 35800000 gpos100 +band hs4 p14 p14 35800000 41200000 gneg +band hs4 p13 p13 41200000 44600000 gpos50 +band hs4 p12 p12 44600000 48200000 gneg +band hs4 p11 p11 48200000 50400000 acen +band hs4 q11 q11 50400000 52700000 acen +band hs4 q12 q12 52700000 59500000 gneg +band hs4 q13.1 q13.1 59500000 66600000 gpos100 +band hs4 q13.2 q13.2 66600000 70500000 gneg +band hs4 q13.3 q13.3 70500000 76300000 gpos75 +band hs4 q21.1 q21.1 76300000 78900000 gneg +band hs4 q21.21 q21.21 78900000 82400000 gpos50 +band hs4 q21.22 q21.22 82400000 84100000 gneg +band hs4 q21.23 q21.23 84100000 86900000 gpos25 +band hs4 q21.3 q21.3 86900000 88000000 gneg +band hs4 q22.1 q22.1 88000000 93700000 gpos75 +band hs4 q22.2 q22.2 93700000 95100000 gneg +band hs4 q22.3 q22.3 95100000 98800000 gpos75 +band hs4 q23 q23 98800000 101100000 gneg +band hs4 q24 q24 101100000 107700000 gpos50 +band hs4 q25 q25 107700000 114100000 gneg +band hs4 q26 q26 114100000 120800000 gpos75 +band hs4 q27 q27 120800000 123800000 gneg +band hs4 q28.1 q28.1 123800000 128800000 gpos50 +band hs4 q28.2 q28.2 128800000 131100000 gneg +band hs4 q28.3 q28.3 131100000 139500000 gpos100 +band hs4 q31.1 q31.1 139500000 141500000 gneg +band hs4 q31.21 q31.21 141500000 146800000 gpos25 +band hs4 q31.22 q31.22 146800000 148500000 gneg +band hs4 q31.23 q31.23 148500000 151100000 gpos25 +band hs4 q31.3 q31.3 151100000 155600000 gneg +band hs4 q32.1 q32.1 155600000 161800000 gpos100 +band hs4 q32.2 q32.2 161800000 164500000 gneg +band hs4 q32.3 q32.3 164500000 170100000 gpos100 +band hs4 q33 q33 170100000 171900000 gneg +band hs4 q34.1 q34.1 171900000 176300000 gpos75 +band hs4 q34.2 q34.2 176300000 177500000 gneg +band hs4 q34.3 q34.3 177500000 183200000 gpos100 +band hs4 q35.1 q35.1 183200000 187100000 gneg +band hs4 q35.2 q35.2 187100000 191154276 gpos25 +band hs5 p15.33 p15.33 0 4500000 gneg +band hs5 p15.32 p15.32 4500000 6300000 gpos25 +band hs5 p15.31 p15.31 6300000 9800000 gneg +band hs5 p15.2 p15.2 9800000 15000000 gpos50 +band hs5 p15.1 p15.1 15000000 18400000 gneg +band hs5 p14.3 p14.3 18400000 23300000 gpos100 +band hs5 p14.2 p14.2 23300000 24600000 gneg +band hs5 p14.1 p14.1 24600000 28900000 gpos100 +band hs5 p13.3 p13.3 28900000 33800000 gneg +band hs5 p13.2 p13.2 33800000 38400000 gpos25 +band hs5 p13.1 p13.1 38400000 42500000 gneg +band hs5 p12 p12 42500000 46100000 gpos50 +band hs5 p11 p11 46100000 48400000 acen +band hs5 q11.1 q11.1 48400000 50700000 acen +band hs5 q11.2 q11.2 50700000 58900000 gneg +band hs5 q12.1 q12.1 58900000 62900000 gpos75 +band hs5 q12.2 q12.2 62900000 63200000 gneg +band hs5 q12.3 q12.3 63200000 66700000 gpos75 +band hs5 q13.1 q13.1 66700000 68400000 gneg +band hs5 q13.2 q13.2 68400000 73300000 gpos50 +band hs5 q13.3 q13.3 73300000 76900000 gneg +band hs5 q14.1 q14.1 76900000 81400000 gpos50 +band hs5 q14.2 q14.2 81400000 82800000 gneg +band hs5 q14.3 q14.3 82800000 92300000 gpos100 +band hs5 q15 q15 92300000 98200000 gneg +band hs5 q21.1 q21.1 98200000 102800000 gpos100 +band hs5 q21.2 q21.2 102800000 104500000 gneg +band hs5 q21.3 q21.3 104500000 109600000 gpos100 +band hs5 q22.1 q22.1 109600000 111500000 gneg +band hs5 q22.2 q22.2 111500000 113100000 gpos50 +band hs5 q22.3 q22.3 113100000 115200000 gneg +band hs5 q23.1 q23.1 115200000 121400000 gpos100 +band hs5 q23.2 q23.2 121400000 127300000 gneg +band hs5 q23.3 q23.3 127300000 130600000 gpos100 +band hs5 q31.1 q31.1 130600000 136200000 gneg +band hs5 q31.2 q31.2 136200000 139500000 gpos25 +band hs5 q31.3 q31.3 139500000 144500000 gneg +band hs5 q32 q32 144500000 149800000 gpos75 +band hs5 q33.1 q33.1 149800000 152700000 gneg +band hs5 q33.2 q33.2 152700000 155700000 gpos50 +band hs5 q33.3 q33.3 155700000 159900000 gneg +band hs5 q34 q34 159900000 168500000 gpos100 +band hs5 q35.1 q35.1 168500000 172800000 gneg +band hs5 q35.2 q35.2 172800000 176600000 gpos25 +band hs5 q35.3 q35.3 176600000 180915260 gneg +band hs6 p25.3 p25.3 0 2300000 gneg +band hs6 p25.2 p25.2 2300000 4200000 gpos25 +band hs6 p25.1 p25.1 4200000 7100000 gneg +band hs6 p24.3 p24.3 7100000 10600000 gpos50 +band hs6 p24.2 p24.2 10600000 11600000 gneg +band hs6 p24.1 p24.1 11600000 13400000 gpos25 +band hs6 p23 p23 13400000 15200000 gneg +band hs6 p22.3 p22.3 15200000 25200000 gpos75 +band hs6 p22.2 p22.2 25200000 27000000 gneg +band hs6 p22.1 p22.1 27000000 30400000 gpos50 +band hs6 p21.33 p21.33 30400000 32100000 gneg +band hs6 p21.32 p21.32 32100000 33500000 gpos25 +band hs6 p21.31 p21.31 33500000 36600000 gneg +band hs6 p21.2 p21.2 36600000 40500000 gpos25 +band hs6 p21.1 p21.1 40500000 46200000 gneg +band hs6 p12.3 p12.3 46200000 51800000 gpos100 +band hs6 p12.2 p12.2 51800000 52900000 gneg +band hs6 p12.1 p12.1 52900000 57000000 gpos100 +band hs6 p11.2 p11.2 57000000 58700000 gneg +band hs6 p11.1 p11.1 58700000 61000000 acen +band hs6 q11.1 q11.1 61000000 63300000 acen +band hs6 q11.2 q11.2 63300000 63400000 gneg +band hs6 q12 q12 63400000 70000000 gpos100 +band hs6 q13 q13 70000000 75900000 gneg +band hs6 q14.1 q14.1 75900000 83900000 gpos50 +band hs6 q14.2 q14.2 83900000 84900000 gneg +band hs6 q14.3 q14.3 84900000 88000000 gpos50 +band hs6 q15 q15 88000000 93100000 gneg +band hs6 q16.1 q16.1 93100000 99500000 gpos100 +band hs6 q16.2 q16.2 99500000 100600000 gneg +band hs6 q16.3 q16.3 100600000 105500000 gpos100 +band hs6 q21 q21 105500000 114600000 gneg +band hs6 q22.1 q22.1 114600000 118300000 gpos75 +band hs6 q22.2 q22.2 118300000 118500000 gneg +band hs6 q22.31 q22.31 118500000 126100000 gpos100 +band hs6 q22.32 q22.32 126100000 127100000 gneg +band hs6 q22.33 q22.33 127100000 130300000 gpos75 +band hs6 q23.1 q23.1 130300000 131200000 gneg +band hs6 q23.2 q23.2 131200000 135200000 gpos50 +band hs6 q23.3 q23.3 135200000 139000000 gneg +band hs6 q24.1 q24.1 139000000 142800000 gpos75 +band hs6 q24.2 q24.2 142800000 145600000 gneg +band hs6 q24.3 q24.3 145600000 149000000 gpos75 +band hs6 q25.1 q25.1 149000000 152500000 gneg +band hs6 q25.2 q25.2 152500000 155500000 gpos50 +band hs6 q25.3 q25.3 155500000 161000000 gneg +band hs6 q26 q26 161000000 164500000 gpos50 +band hs6 q27 q27 164500000 171115067 gneg +band hs7 p22.3 p22.3 0 2800000 gneg +band hs7 p22.2 p22.2 2800000 4500000 gpos25 +band hs7 p22.1 p22.1 4500000 7300000 gneg +band hs7 p21.3 p21.3 7300000 13800000 gpos100 +band hs7 p21.2 p21.2 13800000 16500000 gneg +band hs7 p21.1 p21.1 16500000 20900000 gpos100 +band hs7 p15.3 p15.3 20900000 25500000 gneg +band hs7 p15.2 p15.2 25500000 28000000 gpos50 +band hs7 p15.1 p15.1 28000000 28800000 gneg +band hs7 p14.3 p14.3 28800000 35000000 gpos75 +band hs7 p14.2 p14.2 35000000 37200000 gneg +band hs7 p14.1 p14.1 37200000 43300000 gpos75 +band hs7 p13 p13 43300000 45400000 gneg +band hs7 p12.3 p12.3 45400000 49000000 gpos75 +band hs7 p12.2 p12.2 49000000 50500000 gneg +band hs7 p12.1 p12.1 50500000 54000000 gpos75 +band hs7 p11.2 p11.2 54000000 58000000 gneg +band hs7 p11.1 p11.1 58000000 59900000 acen +band hs7 q11.1 q11.1 59900000 61700000 acen +band hs7 q11.21 q11.21 61700000 67000000 gneg +band hs7 q11.22 q11.22 67000000 72200000 gpos50 +band hs7 q11.23 q11.23 72200000 77500000 gneg +band hs7 q21.11 q21.11 77500000 86400000 gpos100 +band hs7 q21.12 q21.12 86400000 88200000 gneg +band hs7 q21.13 q21.13 88200000 91100000 gpos75 +band hs7 q21.2 q21.2 91100000 92800000 gneg +band hs7 q21.3 q21.3 92800000 98000000 gpos75 +band hs7 q22.1 q22.1 98000000 103800000 gneg +band hs7 q22.2 q22.2 103800000 104500000 gpos50 +band hs7 q22.3 q22.3 104500000 107400000 gneg +band hs7 q31.1 q31.1 107400000 114600000 gpos75 +band hs7 q31.2 q31.2 114600000 117400000 gneg +band hs7 q31.31 q31.31 117400000 121100000 gpos75 +band hs7 q31.32 q31.32 121100000 123800000 gneg +band hs7 q31.33 q31.33 123800000 127100000 gpos75 +band hs7 q32.1 q32.1 127100000 129200000 gneg +band hs7 q32.2 q32.2 129200000 130400000 gpos25 +band hs7 q32.3 q32.3 130400000 132600000 gneg +band hs7 q33 q33 132600000 138200000 gpos50 +band hs7 q34 q34 138200000 143100000 gneg +band hs7 q35 q35 143100000 147900000 gpos75 +band hs7 q36.1 q36.1 147900000 152600000 gneg +band hs7 q36.2 q36.2 152600000 155100000 gpos25 +band hs7 q36.3 q36.3 155100000 159138663 gneg +band hs8 p23.3 p23.3 0 2200000 gneg +band hs8 p23.2 p23.2 2200000 6200000 gpos75 +band hs8 p23.1 p23.1 6200000 12700000 gneg +band hs8 p22 p22 12700000 19000000 gpos100 +band hs8 p21.3 p21.3 19000000 23300000 gneg +band hs8 p21.2 p21.2 23300000 27400000 gpos50 +band hs8 p21.1 p21.1 27400000 28800000 gneg +band hs8 p12 p12 28800000 36500000 gpos75 +band hs8 p11.23 p11.23 36500000 38300000 gneg +band hs8 p11.22 p11.22 38300000 39700000 gpos25 +band hs8 p11.21 p11.21 39700000 43100000 gneg +band hs8 p11.1 p11.1 43100000 45600000 acen +band hs8 q11.1 q11.1 45600000 48100000 acen +band hs8 q11.21 q11.21 48100000 52200000 gneg +band hs8 q11.22 q11.22 52200000 52600000 gpos75 +band hs8 q11.23 q11.23 52600000 55500000 gneg +band hs8 q12.1 q12.1 55500000 61600000 gpos50 +band hs8 q12.2 q12.2 61600000 62200000 gneg +band hs8 q12.3 q12.3 62200000 66000000 gpos50 +band hs8 q13.1 q13.1 66000000 68000000 gneg +band hs8 q13.2 q13.2 68000000 70500000 gpos50 +band hs8 q13.3 q13.3 70500000 73900000 gneg +band hs8 q21.11 q21.11 73900000 78300000 gpos100 +band hs8 q21.12 q21.12 78300000 80100000 gneg +band hs8 q21.13 q21.13 80100000 84600000 gpos75 +band hs8 q21.2 q21.2 84600000 86900000 gneg +band hs8 q21.3 q21.3 86900000 93300000 gpos100 +band hs8 q22.1 q22.1 93300000 99000000 gneg +band hs8 q22.2 q22.2 99000000 101600000 gpos25 +band hs8 q22.3 q22.3 101600000 106200000 gneg +band hs8 q23.1 q23.1 106200000 110500000 gpos75 +band hs8 q23.2 q23.2 110500000 112100000 gneg +band hs8 q23.3 q23.3 112100000 117700000 gpos100 +band hs8 q24.11 q24.11 117700000 119200000 gneg +band hs8 q24.12 q24.12 119200000 122500000 gpos50 +band hs8 q24.13 q24.13 122500000 127300000 gneg +band hs8 q24.21 q24.21 127300000 131500000 gpos50 +band hs8 q24.22 q24.22 131500000 136400000 gneg +band hs8 q24.23 q24.23 136400000 139900000 gpos75 +band hs8 q24.3 q24.3 139900000 146364022 gneg +band hs9 p24.3 p24.3 0 2200000 gneg +band hs9 p24.2 p24.2 2200000 4600000 gpos25 +band hs9 p24.1 p24.1 4600000 9000000 gneg +band hs9 p23 p23 9000000 14200000 gpos75 +band hs9 p22.3 p22.3 14200000 16600000 gneg +band hs9 p22.2 p22.2 16600000 18500000 gpos25 +band hs9 p22.1 p22.1 18500000 19900000 gneg +band hs9 p21.3 p21.3 19900000 25600000 gpos100 +band hs9 p21.2 p21.2 25600000 28000000 gneg +band hs9 p21.1 p21.1 28000000 33200000 gpos100 +band hs9 p13.3 p13.3 33200000 36300000 gneg +band hs9 p13.2 p13.2 36300000 38400000 gpos25 +band hs9 p13.1 p13.1 38400000 41000000 gneg +band hs9 p12 p12 41000000 43600000 gpos50 +band hs9 p11.2 p11.2 43600000 47300000 gneg +band hs9 p11.1 p11.1 47300000 49000000 acen +band hs9 q11 q11 49000000 50700000 acen +band hs9 q12 q12 50700000 65900000 gvar +band hs9 q13 q13 65900000 68700000 gneg +band hs9 q21.11 q21.11 68700000 72200000 gpos25 +band hs9 q21.12 q21.12 72200000 74000000 gneg +band hs9 q21.13 q21.13 74000000 79200000 gpos50 +band hs9 q21.2 q21.2 79200000 81100000 gneg +band hs9 q21.31 q21.31 81100000 84100000 gpos50 +band hs9 q21.32 q21.32 84100000 86900000 gneg +band hs9 q21.33 q21.33 86900000 90400000 gpos50 +band hs9 q22.1 q22.1 90400000 91800000 gneg +band hs9 q22.2 q22.2 91800000 93900000 gpos25 +band hs9 q22.31 q22.31 93900000 96600000 gneg +band hs9 q22.32 q22.32 96600000 99300000 gpos25 +band hs9 q22.33 q22.33 99300000 102600000 gneg +band hs9 q31.1 q31.1 102600000 108200000 gpos100 +band hs9 q31.2 q31.2 108200000 111300000 gneg +band hs9 q31.3 q31.3 111300000 114900000 gpos25 +band hs9 q32 q32 114900000 117700000 gneg +band hs9 q33.1 q33.1 117700000 122500000 gpos75 +band hs9 q33.2 q33.2 122500000 125800000 gneg +band hs9 q33.3 q33.3 125800000 130300000 gpos25 +band hs9 q34.11 q34.11 130300000 133500000 gneg +band hs9 q34.12 q34.12 133500000 134000000 gpos25 +band hs9 q34.13 q34.13 134000000 135900000 gneg +band hs9 q34.2 q34.2 135900000 137400000 gpos25 +band hs9 q34.3 q34.3 137400000 141213431 gneg +band hsX p22.33 p22.33 0 4300000 gneg +band hsX p22.32 p22.32 4300000 6000000 gpos50 +band hsX p22.31 p22.31 6000000 9500000 gneg +band hsX p22.2 p22.2 9500000 17100000 gpos50 +band hsX p22.13 p22.13 17100000 19300000 gneg +band hsX p22.12 p22.12 19300000 21900000 gpos50 +band hsX p22.11 p22.11 21900000 24900000 gneg +band hsX p21.3 p21.3 24900000 29300000 gpos100 +band hsX p21.2 p21.2 29300000 31500000 gneg +band hsX p21.1 p21.1 31500000 37600000 gpos100 +band hsX p11.4 p11.4 37600000 42400000 gneg +band hsX p11.3 p11.3 42400000 46400000 gpos75 +band hsX p11.23 p11.23 46400000 49800000 gneg +band hsX p11.22 p11.22 49800000 54800000 gpos25 +band hsX p11.21 p11.21 54800000 58100000 gneg +band hsX p11.1 p11.1 58100000 60600000 acen +band hsX q11.1 q11.1 60600000 63000000 acen +band hsX q11.2 q11.2 63000000 64600000 gneg +band hsX q12 q12 64600000 67800000 gpos50 +band hsX q13.1 q13.1 67800000 71800000 gneg +band hsX q13.2 q13.2 71800000 73900000 gpos50 +band hsX q13.3 q13.3 73900000 76000000 gneg +band hsX q21.1 q21.1 76000000 84600000 gpos100 +band hsX q21.2 q21.2 84600000 86200000 gneg +band hsX q21.31 q21.31 86200000 91800000 gpos100 +band hsX q21.32 q21.32 91800000 93500000 gneg +band hsX q21.33 q21.33 93500000 98300000 gpos75 +band hsX q22.1 q22.1 98300000 102600000 gneg +band hsX q22.2 q22.2 102600000 103700000 gpos50 +band hsX q22.3 q22.3 103700000 108700000 gneg +band hsX q23 q23 108700000 116500000 gpos75 +band hsX q24 q24 116500000 120900000 gneg +band hsX q25 q25 120900000 128700000 gpos100 +band hsX q26.1 q26.1 128700000 130400000 gneg +band hsX q26.2 q26.2 130400000 133600000 gpos25 +band hsX q26.3 q26.3 133600000 138000000 gneg +band hsX q27.1 q27.1 138000000 140300000 gpos75 +band hsX q27.2 q27.2 140300000 142100000 gneg +band hsX q27.3 q27.3 142100000 147100000 gpos100 +band hsX q28 q28 147100000 155270560 gneg +band hsY p11.32 p11.32 0 2500000 gneg +band hsY p11.31 p11.31 2500000 3000000 gpos50 +band hsY p11.2 p11.2 3000000 11600000 gneg +band hsY p11.1 p11.1 11600000 12500000 acen +band hsY q11.1 q11.1 12500000 13400000 acen +band hsY q11.21 q11.21 13400000 15100000 gneg +band hsY q11.221 q11.221 15100000 19800000 gpos50 +band hsY q11.222 q11.222 19800000 22100000 gneg +band hsY q11.223 q11.223 22100000 26200000 gpos50 +band hsY q11.23 q11.23 26200000 28800000 gneg +band hsY q12 q12 28800000 59373566 gvar diff -r c284618dd8da -r 091714bd75a0 SVDetect_r0.8b_galaxy/svdetect/BAM_preprocessingPairs.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SVDetect_r0.8b_galaxy/svdetect/BAM_preprocessingPairs.pl Tue Jan 22 06:20:22 2013 -0500 @@ -0,0 +1,340 @@ +#!/usr/bin/perl -w + +use strict; +use warnings; +use Getopt::Std; +my $version = '0.5_galaxy'; + +my $SAMTOOLS_BIN_DIR="/bioinfo/local/samtools"; + +my %opts = ( t=>1, p=>1, n=>1000000, f=>3, s=>0, S=>10000, o=>"." ); + +getopts('dt:p:n:f:s:S:o:b:l:x:N:', \%opts); #GALAXY + +my $working_dir=($opts{o} ne ".")? $opts{o}:"working directory"; + +my $pt_bad_mates_file=$opts{b}; #GALAXY +my $pt_log_file=$opts{l}; #GALAXY +my $pt_good_mates_file=$opts{x} if($opts{d}); #GALAXY + + +die(qq/ + +Description: + + Preprocessing of mates to get anomalously mapped mate-pair\/paired-end reads as input + for SVDetect. + + From all pairs mapped onto the reference genome, this script outputs abnormal pairs: + - mapped on two different chromosomes + - with an incorrect strand orientation and\/or pair order + - with an insert size distance +- sigma threshold + into a file + + -BAM\/SAM File input format only, sorted by read names + + Version : $version + SAMtools required for BAM files + + +Usage: BAM_preprocessingPairs.pl [options] + +Options: -t BOOLEAN read type: =1 (Illumina), =0 (SOLiD) [$opts{t}] + -p BOOLEAN pair type: =1 (paired-end), =0 (mate-pair) [$opts{p}] + -n INTEGER number of pairs for calculating mu and sigma lengths [$opts{n}] + -s INTEGER minimum value of ISIZE for calculating mu and sigma lengths [$opts{s}] + -S INTEGER maximum value of ISIZE for calculating mu and sigma lengths [$opts{S}] + -f REAL minimal number of sigma fold for filtering pairs [$opts{f}] + -d dump normal pairs into a file [] (optional) + -o STRING output directory [$working_dir] + +\n/) if (@ARGV == 0 && -t STDIN); + +unless (-d $opts{o}){ + mkdir $opts{o} or die; +} +$opts{o}.="/" if($opts{o}!~/\/$/); + +my $mates_file=shift(@ARGV); + +$mates_file=readlink($mates_file); + +my $bad_mates_file=(split(/\//,$mates_file))[$#_]; + +if($bad_mates_file=~/.(s|b)am$/){ + $bad_mates_file=~s/.(b|s)am$/.ab.sam/; + $bad_mates_file=$opts{o}.$bad_mates_file; +} + +else{ + die "Error: mate_file with the extension <.bam> or <.sam> needed !\n"; +} + +my $good_mates_file; +if($opts{d}){ + $good_mates_file=(split(/\//,$mates_file))[$#_]; + $good_mates_file=~s/.(b|s)am$/.norm.sam/; + $good_mates_file=$opts{o}.$good_mates_file; +} + +my $log_file=$opts{o}.$opts{N}.".svdetect_preprocessing.log"; #GALAXY + +#------------------------------------------------------------------------------# +#Calculate mu and sigma + +open LOG,">$log_file" or die "$0: can't open ".$log_file.":$!\n"; + +print LOG "\# Calculating mu and sigma lengths...\n"; +print LOG "-- file=$mates_file\n"; +print LOG "-- n=$opts{n}\n"; +print LOG "-- ISIZE min=$opts{s}, max=$opts{S}\n"; + +my ($record, $sumX,$sumX2) = (0,0,0); +my $warn=$opts{n}/10; +my $prev_pair="FIRST"; + +my $bam=($mates_file =~ /.bam$/)? 1:0; + +if($bam){ + open(MATES, "${SAMTOOLS_BIN_DIR}/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; +}else{ + open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n"; +} + +while(){ + + my @t=split; + + next if ($t[0]=~/^@/); + + my $current_pair=$t[0]; + next if($current_pair eq $prev_pair); + $prev_pair=$current_pair; + + my ($chr1,$chr2,$length)=($t[2],$t[6],abs($t[8])); + + next if (($t[1]&0x0004) || ($t[1]&0x0008)); + next if ($length<$opts{s} || $length>$opts{S}) ; + + if($chr2 eq "="){ + + $sumX += $length; #add to sum and sum^2 for mean and variance calculation + $sumX2 += $length*$length; + $record++; + } + + if($record>$warn){ + print LOG "-- $warn pairs analysed\n"; + $warn+=$warn; + } + + last if ($record>$opts{n}); + +} +close (MATES); + +$record--; +my $mu = $sumX/$record; +my $sigma = sqrt($sumX2/$record - $mu*$mu); + +print LOG "-- Total : $record pairs analysed\n"; +print LOG "-- mu length = ".decimal($mu,1).", sigma length = ".decimal($sigma,1)."\n"; + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#Preprocessing pairs + +$warn=100000; + +$record=0; +my %count=( ab=>0, norm=>0, chr=>0, sense=>0, dist=>0, unmap=>0); + +my $read_type=($opts{t})? "Illumina":"SOLiD"; +my $pair_type=($opts{p})? "paired-end":"mate-paired"; + +print LOG "\# Preprocessing pairs...\n"; +print LOG "-- file= $mates_file\n"; +print LOG "-- type= $read_type $pair_type reads\n"; +print LOG "-- sigma threshold= $opts{f}\n"; +print LOG "-- using ".decimal($mu-$opts{f}*$sigma,4)."-".decimal($mu+$opts{f}*$sigma,4)." as normal range of insert size\n"; + +my @header; + +if($bam){ + open(HEADER, "${SAMTOOLS_BIN_DIR}/samtools view -H $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; + @header=
; + close HEADER; + open(MATES, "${SAMTOOLS_BIN_DIR}/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; +}else{ + open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n"; +} + +open AB, ">$bad_mates_file" or die "$0: can't write in the output: $bad_mates_file :$!\n"; +print AB @header if($bam); + +if($opts{d}){ + open NORM, ">$good_mates_file" or die "$0: can't write in the output: $good_mates_file :$!\n"; + print NORM @header if($bam); +} + +$prev_pair="FIRST"; +my $prev_bad; + +while(){ + + my @t=split; + my $bad=0; + + if ($t[0]=~/^@/){ + print AB; + print NORM if ($opts{d}); + next; + } + + my $current_pair=$t[0]; + if($current_pair eq $prev_pair){ + next if($prev_bad==-1); + if($prev_bad){ + print AB; + }elsif(!$prev_bad){ + print NORM if($opts{d}); + } + next; + } + + $prev_pair=$current_pair; + + my ($chr1,$chr2,$pos1,$pos2,$length)=($t[2],$t[6],$t[3],$t[7], abs($t[8])); + + if (($t[1]&0x0004) || ($t[1]&0x0008)){ + $prev_bad=-1; + $count{unmap}++; + $record++; + next; + + } + + my $strand1 = (($t[1]&0x0010))? 'R':'F'; + my $strand2 = (($t[1]&0x0020))? 'R':'F'; + my $order1 = (($t[1]&0x0040))? '1':'2'; + my $order2 = (($t[1]&0x0080))? '1':'2'; + + if($order1 == 2){ + ($strand1,$strand2)=($strand2,$strand1); + ($chr1,$chr2)=($chr2,$chr1); + ($pos1,$pos2)=($pos2,$pos1); + ($order1,$order2)=($order2,$order1); + } + + my $sense=$strand1.$strand2; + + if($chr1 ne "=" && $chr2 ne "="){ + $bad=1; + $count{chr}++; + } + + if($opts{p}){ #paired-end + if(!(($sense eq "FR" && $pos1<$pos2) || ($sense eq "RF" && $pos2<$pos1))){ + $bad=1; + $count{sense}++; + } + }else{ #mate-pair + if($opts{t}){ #Illumina + if(!(($sense eq "FR" && $pos2<$pos1) || ($sense eq "RF" && $pos1<$pos2))){ + $bad=1; + $count{sense}++; + } + }else{ #SOLiD + if(!(($sense eq "FF" && $pos2<$pos1) || ($sense eq "RR" && $pos1<$pos2))){ + $bad=1; + $count{sense}++; + } + } + } + + if(($chr1 eq "=" || $chr2 eq "=") && ($length <$mu - $opts{f}*$sigma || $length>$mu + $opts{f}*$sigma)){ + $bad=1; + $count{dist}++; + } + + if($bad){ + print AB; + $count{ab}++; + $prev_bad=$bad; + }else{ + print NORM if ($opts{d}); + $count{norm}++; + $prev_bad=$bad; + } + + $record++; + + if($record>$warn){ + print LOG "-- $warn pairs analysed\n"; + $warn+=100000; + } +} + +close AB; +close NORM if($opts{d}); + +print LOG "-- Total : $record pairs analysed\n"; +print LOG "-- $count{unmap} pairs whose one or both reads are unmapped\n"; +print LOG "-- ".($count{ab}+$count{norm})." mapped pairs\n"; +print LOG "---- $count{ab} abnormal mapped pairs\n"; +print LOG "------ $count{chr} pairs mapped on two different chromosomes\n"; +print LOG "------ $count{sense} pairs with incorrect strand orientation and\/or pair order\n"; +print LOG "------ $count{dist} pairs with incorrect insert size distance\n"; +print LOG "--- $count{norm} correct mapped pairs\n"; + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#OUTPUT + +if($bam){ + + my $bam_file=$bad_mates_file; + $bam_file=~s/.sam$/.bam/; + print LOG "\# Converting sam to bam for abnormal mapped pairs\n"; + system("${SAMTOOLS_BIN_DIR}/samtools view -bS $bad_mates_file > $bam_file 2>".$opts{o}."samtools.log"); + unlink($bad_mates_file); + print LOG "-- output created: $bam_file\n"; + + system "rm $pt_bad_mates_file ; ln -s $bam_file $pt_bad_mates_file"; #GALAXY + + if($opts{d}){ + $bam_file=$good_mates_file; + $bam_file=~s/.sam$/.bam/; + print LOG "\# Converting sam to bam for correct mapped pairs\n"; + system("${SAMTOOLS_BIN_DIR}/samtools view -bS $good_mates_file > $bam_file 2>".$opts{o}."samtools.log"); + unlink($good_mates_file); + print LOG "-- output created: $bam_file\n"; + + system "rm $pt_good_mates_file ; ln -s $bam_file $pt_good_mates_file"; #GALAXY + + } + +} + +else{ + print LOG "-- output created: $bad_mates_file\n"; + print LOG "-- output created: $good_mates_file\n" if($opts{d}); +} + +close LOG; + +system "rm $pt_log_file ; ln -s $log_file $pt_log_file"; #GALAXY + + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub decimal{ + + my $num=shift; + my $digs_to_cut=shift; + + $num=sprintf("%.".($digs_to_cut-1)."f", $num) if ($num=~/\d+\.(\d){$digs_to_cut,}/); + + return $num; +} +#------------------------------------------------------------------------------# diff -r c284618dd8da -r 091714bd75a0 SVDetect_r0.8b_galaxy/svdetect/BAM_preprocessingPairs.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SVDetect_r0.8b_galaxy/svdetect/BAM_preprocessingPairs.xml Tue Jan 22 06:20:22 2013 -0500 @@ -0,0 +1,77 @@ + + + to get abnormal pairs + + BAM_preprocessingPairs.pl -t '$readType' -p '$pairType' -n '$nbrePair' -s '$isizeMin' -S '$isizeMax' -f '$foldPair' -o $__new_file_path__/svdetect -b '$abBAM' -l '$log' -N $sample_name + #if $newBam.pairNormal=="yes" + -d -x '$normBAM' + #end if + '$inputBam' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + newBam['pairNormal'] == 'yes' + + + + + +**What it does** + +Bam_preprocessingPairs - Version 0.5 + +Preprocessing of mates to get anomalously mapped mate-pair/paired-end reads as input for SVDetect. + +From all pairs mapped onto the reference genome, this script outputs abnormal pairs: + + * mapped on two different chromosomes + * with an incorrect strand orientation and/or pair order + * with an insert size distance +- sigma threshold + +into a file prefix.ab.bam/sam sorted by read names + +-BAM/SAM File input format only. + +SAMtools required for BAM files + +----- + +.. class:: infomark + +Contact Bruno Zeitouni (svdetect@curie.fr) for any questions or concerns about the Galaxy implementation of SVDetect. + + + + diff -r c284618dd8da -r 091714bd75a0 SVDetect_r0.8b_galaxy/svdetect/SVDetect_compare.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SVDetect_r0.8b_galaxy/svdetect/SVDetect_compare.pl Tue Jan 22 06:20:22 2013 -0500 @@ -0,0 +1,716 @@ +#!/usr/bin/perl -w + +=pod + +=head1 NAME + +SVDetect Compare for Galaxy + +Version: 0.8b for Galaxy + +=head1 SYNOPSIS + +SVDetect_compare.pl links2compare -conf [-help] [-man] + +=cut + +# ------------------------------------------------------------------- + +use strict; +use warnings; + +use Pod::Usage; +use Getopt::Long; + +use Config::General; +use Tie::IxHash; + +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# +#PARSE THE COMMAND LINE +my %OPT; +GetOptions(\%OPT, + 'conf=s', + 'out1=s', #GALAXY + 'out2=s', #GALAXY + 'out3=s', #GALAXY + 'out4=s', #GALAXY + 'out5=s', #GALAXY + 'out6=s', #GALAXY + 'out7=s', #GALAXY + 'out8=s', #GALAXY + 'out9=s', #GALAXY + 'l=s', #GALAXY + 'N=s', #GALAXY + 'help', + 'man' + ); + +pod2usage() if $OPT{help}; +pod2usage(-verbose=>2) if $OPT{man}; +pod2usage(-message=> "$!", -exitval => 2) if (!defined $OPT{conf}); + + +pod2usage() if(@ARGV<1); + +tie (my %func, 'Tie::IxHash',links2compare=>\&links2compare); + +foreach my $command (@ARGV){ + pod2usage(-message=> "Unknown command \"$command\"", -exitval => 2) if (!defined($func{$command})); +} +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# + +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# +#READ THE CONFIGURATION FILE +my $conf=Config::General->new( -ConfigFile => $OPT{conf}, + -Tie => "Tie::IxHash", + -AllowMultiOptions => 1, + -LowerCaseNames => 1, + -AutoTrue => 1); +my %CONF= $conf->getall; +validateconfiguration(\%CONF); #validation of the configuration parameters + + +my $SAMTOOLS_BIN_DIR="/bioinfo/local/samtools"; #GALAXY +my $BEDTOOLS_BIN_DIR="/bioinfo/local/BEDTools/bin"; #GALAXY + +my $pt_log_file=$OPT{l}; #GALAXY +my $log_file=$CONF{general}{output_dir}.$OPT{N}.".svdetect_compare.log"; #GALAXY +open LOG,">$log_file" or die "$0: can't open ".$log_file.":$!\n";#GALAXY + +my @pt_sv_file=($OPT{out1},$OPT{out2},$OPT{out3}) if($OPT{out1}); #GALAXY common,sample,reference +my @pt_circos_file=($OPT{out4},$OPT{out5},$OPT{out6}) if($OPT{out4}); #GALAXY common,sample,reference +my @pt_bed_file=($OPT{out7},$OPT{out8},$OPT{out9}) if($OPT{out7}); #GALAXY common,sample,reference + +$CONF{compare}{sample_link_file}=readlink($CONF{compare}{sample_link_file});#GALAXY +$CONF{compare}{sample_link_file}=~s/.sv.txt//; #GALAXY + +$CONF{compare}{reference_link_file}=readlink($CONF{compare}{reference_link_file});#GALAXY +$CONF{compare}{reference_link_file}=~s/.sv.txt//; #GALAXY + +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# + +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# +#COMMAND EXECUTION +foreach my $command (@ARGV){ + &{$func{$command}}(); +} +print LOG "-- end\n"; + +close LOG;#GALAXY +system "rm $pt_log_file ; ln -s $log_file $pt_log_file"; #GALAXY + +exit(0); +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# + +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# +#FUNCTIONS + +# -----------------------------------------------------------------------------# +#MAIN FUNCTION number 5:Comparison between samples, common or specific links +sub links2compare{ + + my @compare_files; + + compareSamples($CONF{general}{output_dir}, + $CONF{compare}{list_samples}, + $CONF{compare}{sample_link_file}, + $CONF{compare}{reference_link_file}, + $CONF{compare}{min_overlap}, + $CONF{compare}{same_sv_type}, + \@compare_files); + + my $pt_ind=0; + + for my $input_file (@compare_files){ + + $input_file=$CONF{general}{output_dir}.$input_file; + + my $output_file=$input_file; + $output_file=~s/unique$/compared/; + + sortLinks($input_file, $output_file,1); + + if($CONF{compare}{circos_output}){ + links2segdup($CONF{circos}{organism_id}, + $CONF{circos}{colorcode}, + $output_file, + $output_file.".segdup.txt"); + system "rm $pt_circos_file[$pt_ind]; ln -s $output_file.segdup.txt $pt_circos_file[$pt_ind]" if (defined $pt_circos_file[$pt_ind]); #GALAXY + } + + if($CONF{compare}{bed_output}){ + links2bedfile($CONF{compare}{read_lengths}, + $CONF{bed}{colorcode}, + $output_file, + $output_file.".bed"); + system "rm $pt_bed_file[$pt_ind]; ln -s $output_file.bed $pt_bed_file[$pt_ind]" if (defined $pt_bed_file[$pt_ind]); #GALAXY + } + + if($CONF{compare}{sv_output}){ + + links2SVfile ($output_file, $output_file.".sv.txt"); + system "rm $pt_sv_file[$pt_ind]; ln -s $output_file.sv.txt $pt_sv_file[$pt_ind]" if (defined $pt_sv_file[$pt_ind]); #GALAXY + } + $pt_ind++; + + } + unlink(@compare_files); + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub compareSamples{ + + my ($dir,$list_samples,$sample_file,$reference_file,$min_overlap,$same_sv_type,$file_names)=@_; + + my @bedpefiles; + my @list=split(",",$list_samples); + my @list_files=($sample_file,$reference_file); + + print LOG "\# Comparison procedure...\n"; + print LOG "-- samples=$list_samples\n". + "-- minimum overlap=$min_overlap\n". + "-- same SV type=$same_sv_type\n"; + + #conversion of links to bedPE format file + print LOG "-- Conversion of links.filtered files to bedPE format\n"; + for my $s (0..$#list) { + + links2bedPElinksfile($list[$s],$list_files[$s],$list_files[$s].".bedpe.txt"); + push(@bedpefiles,$list_files[$s].".bedpe.txt"); + + } + + #get common links between all samples compared + print LOG "-- Getting common links between all samples with BEDTools\n"; + my $common_name=join(".",@list); + + my $nb=scalar @list; + my $command=""; + my $prompt=">"; + + while ($nb>0){ + + for my $i (0..$#list_files){ + + $command.="$BEDTOOLS_BIN_DIR/pairToPair -type both -f $min_overlap -a ".$list_files[$i].".bedpe.txt"; + my $pipe=0; + + for my $j ($i+1..$#list_files){ + + $command.="| $BEDTOOLS_BIN_DIR/pairToPair -type both -f $min_overlap -a stdin" if($pipe); + $command.=" -b ".$list_files[$j].".bedpe.txt"; + $pipe=1; + + } + + $command.=$prompt.$dir.$common_name.".bedpe.tmp;"; + $prompt=">>"; + + my $first=shift(@list_files); push(@list_files,$first); + last; + } + $nb--; + } + + system ($command); + + push(@bedpefiles,$dir.$common_name.".bedpe.tmp"); + + #Post comparison to get common links if same type only (as an option) + open( FILE, "<".$dir.$common_name.".bedpe.tmp") or die "Can't open".$dir.$common_name.".bedpe.tmp : $!"; + open( OUT, ">".$dir.$common_name.".bedpe.unique") or die "Can't write in ".$dir.$common_name.".bedpe.unique : $!"; + + while(){ + my @t=split("\t",$_); + my $s=(split("_",$t[6]))[0]; + my ($sv1,$sv2)=($t[7],$t[18]); + splice(@t,11,$#t); + + if($same_sv_type){ + print OUT join("\t",@t)."\n" if($sv1 eq $sv2); + }else{ + print OUT join("\t",@t)."\n"; + } + } + close FILE; + close OUT; + + bedPElinks2linksfile($dir.$common_name.".bedpe.unique", $dir.$common_name.".unique"); + push(@bedpefiles,$dir.$common_name.".bedpe.unique"); + push(@$file_names,$common_name.".unique"); + print LOG "-- output created: ".$dir.$common_name.".compared\n"; + + #get specific links for each sample + print LOG "-- Getting specific links for each sample\n"; + for my $s (0..$#list) { + system("grep -Fxv -f ".$dir.$common_name.".bedpe.unique ".$list_files[$s].".bedpe.txt >".$dir.$list[$s].".bedpe.unique"); + bedPElinks2linksfile($dir.$list[$s].".bedpe.unique",$dir.$list[$s].".unique"); + push(@bedpefiles,$dir.$list[$s].".bedpe.unique"); + push(@$file_names,$list[$s].".unique"); + print LOG "-- output created: ".$dir.$list[$s].".compared\n"; + } + + unlink(@bedpefiles); + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#convert the links file to the circos format +sub links2segdup{ + + my($id,$color_code,$links_file,$segdup_file)=@_; + + print LOG "\# Converting to the circos format...\n"; + + tie (my %hcolor,'Tie::IxHash'); #color-code hash table + foreach my $col (keys %{$color_code}){ + my ($min_links,$max_links)=split(",",$color_code->{$col}); + $hcolor{$col}=[$min_links,$max_links]; + } + + open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n"; + open SEGDUP, ">$segdup_file" or die "$0: can't write in the output: $segdup_file :$!\n"; + + my $index=1; + while(){ + + my ($chr1,$start1,$end1,$chr2,$start2,$end2,$count)=(split)[0,1,2,3,4,5,6]; + + my $color=getColor($count,\%hcolor,"circos"); #get the color-code according the number of links + + print SEGDUP "$index\t$id$chr1\t$start1\t$end1\tcolor=$color\n". #circos output + "$index\t$id$chr2\t$start2\t$end2\tcolor=$color\n"; + $index++; + } + + close LINKS; + close SEGDUP; + print LOG "-- output created: $segdup_file\n"; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#convert the links file to the bedPE format for BEDTools usage +sub links2bedPElinksfile{ + + my ($sample,$links_file,$bedpe_file)=@_; + + open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n"; + open BEDPE, ">$bedpe_file" or die "$0: can't write in the output: $bedpe_file :$!\n"; + + my $nb_links=1; + + while(){ + + chomp; + my @t=split("\t",$_); + my ($chr1,$start1,$end1,$chr2,$start2,$end2)=splice(@t,0,6); + my $type=($chr1 eq $chr2)? "INTRA":"INTER"; + $type.="_".$t[10]; + + $start1--; $start2--; + + print BEDPE "$chr1\t$start1\t$end1\t$chr2\t$start2\t$end2". + "\t$sample"."_link$nb_links\t$type\t.\t.". + "\t".join("|",@t)."\n"; + + $nb_links++; + } + + close LINKS; + close BEDPE; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub bedPElinks2linksfile{ + + my ($bedpe_file,$links_file)=@_; + + open BEDPE, "<$bedpe_file" or die "$0: can't open: $bedpe_file :$!\n"; + open LINKS, ">$links_file" or die "$0: can't write in the output $links_file :$!\n"; + + while(){ + + chomp; + my $sample=(split("_",(split("\t",$_))[6]))[0]; + my @t1=(split("\t",$_))[0,1,2,3,4,5]; + my @t2=split(/\|/,(split("\t",$_))[10]); + push(@t2,$sample); + + print LINKS join("\t",@t1)."\t".join("\t",@t2)."\n"; + + } + close BEDPE; + close LINKS; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#convert the links file to the bed format +sub links2bedfile{ + + my ($tag_length,$color_code,$links_file,$bed_file)=@_; + + print LOG "\# Converting to the bed format...\n"; + + my $compare=1; + if($links_file!~/compared$/){ + $compare=0; + $tag_length->{none}->{1}=$tag_length->{1}; + $tag_length->{none}->{2}=$tag_length->{2}; + } + + #color-code hash table + tie (my %hcolor,'Tie::IxHash'); + my %color_order; + $color_order{"255,255,255"}=0; + my $n=1; + foreach my $col (keys %{$color_code}){ + my ($min_links,$max_links)=split(",",$color_code->{$col}); + $hcolor{$col}=[$min_links,$max_links]; + $color_order{$col}=$n; + $n++; + } + + my %pair; + my %pt; + $n=1; + open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n"; + + my %str=( "F"=>"+", "R"=>"-" ); + + while(){ + + my @t=split; + my $sample=($compare)? pop(@t):"none"; + + my $chr1=$t[0]; + my $chr2=$t[3]; + $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i); + $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i); + my $same_chr=($chr1 eq $chr2)? 1:0; + + my $count=$t[6]; + my $color=getColor($count,\%hcolor,"bed"); + + my @pairs=deleteBadOrderSensePairs(split(",",$t[7])); + my @strand1=deleteBadOrderSensePairs(split(",",$t[8])); + my @strand2=deleteBadOrderSensePairs(split(",",$t[9])); + my @ends_order1=deleteBadOrderSensePairs(split(",",$t[10])); + my @ends_order2=deleteBadOrderSensePairs(split(",",$t[11])); + my @position1=deleteBadOrderSensePairs(split(",",$t[14])); + my @position2=deleteBadOrderSensePairs(split(",",$t[15])); + my @start1; my @end1; getCoordswithLeftMost(\@start1,\@end1,\@position1,\@strand1,\@ends_order1,$tag_length->{$sample}); + my @start2; my @end2; getCoordswithLeftMost(\@start2,\@end2,\@position2,\@strand2,\@ends_order2,$tag_length->{$sample}); + + + for my $p (0..$#pairs){ + + if (!exists $pair{$pairs[$p]}){ + + if($same_chr){ + + $pair{$pairs[$p]}->{0}=[ $chr1, $start1[$p]-1, $end2[$p], $pairs[$p], 0, $str{$strand1[$p]}, + $start1[$p]-1, $end2[$p], $color, + 2, $tag_length->{$sample}->{$ends_order1[$p]}.",".$tag_length->{$sample}->{$ends_order2[$p]}, "0,".($start2[$p]-$start1[$p]) ]; + $pt{$n}=$pair{$pairs[$p]}->{0}; + $n++; + + }else{ + + $pair{$pairs[$p]}->{1}=[ $chr1, $start1[$p]-1, $end1[$p] , $pairs[$p]."/1", 0, $str{$strand1[$p]}, + $start1[$p]-1, $end1[$p], $color, + 1, $tag_length->{$sample}->{$ends_order1[$p]}, 0]; + $pt{$n}=$pair{$pairs[$p]}->{1}; + $n++; + + + $pair{$pairs[$p]}->{2}=[ $chr2, $start2[$p]-1, $end2[$p], $pairs[$p]."/2", 0, $str{$strand2[$p]}, + $start2[$p]-1, $end2[$p], $color, + 1, $tag_length->{$sample}->{$ends_order2[$p]}, 0]; + $pt{$n}=$pair{$pairs[$p]}->{2}; + $n++; + } + }else{ + + if($same_chr){ + ${$pair{$pairs[$p]}->{0}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{0}}[8]}); + }else{ + ${$pair{$pairs[$p]}->{1}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{1}}[8]}); + ${$pair{$pairs[$p]}->{2}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{2}}[8]}); + } + } + } + } + close LINKS; + + my $nb_pairs=$n-1; + + open BED, ">$bed_file" or die "$0: can't write in the output: $bed_file :$!\n"; + print BED "track name=\"$bed_file\" description=\"mate pairs involved in links\" ". + "visibility=2 itemRgb=\"On\"\n"; + + for my $i (1..$nb_pairs){ + print BED join("\t",@{$pt{$i}})."\n"; + } + + close BED; + + print LOG "-- output created: $bed_file\n"; + + undef %pair; + undef %pt; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub links2SVfile{ + + my($links_file,$sv_file)=@_; + + print LOG "\# Converting to the sv output table...\n"; + open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n"; + open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n"; + + my @header=qw(chr_type SV_type BAL_type chromosome1 start1-end1 average_dist + chromosome2 start2-end2 nb_pairs score_strand_filtering score_order_filtering score_insert_size_filtering + final_score breakpoint1_start1-end1 breakpoint2_start2-end2); + + my $nb_links=0; + + while (){ + + my @t=split; + my @sv=(); + my $sv_type="-"; + my $strand_ratio="-"; + my $eq_ratio="-"; + my $eq_type="-"; + my $insert_ratio="-"; + my $link="-"; + my ($bk1, $bk2)=("-","-"); + my $score="-"; + + my ($chr1,$start1,$end1)=($t[0],$t[1],$t[2]); + my ($chr2,$start2,$end2)=($t[3],$t[4],$t[5]); + my $nb_pairs=$t[6]; + $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i); + $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i); + my $chr_type=($chr1 eq $chr2)? "INTRA":"INTER"; + + #if strand filtering + if (defined $t[16]){ + #if inter-chr link + $sv_type=$t[16]; + if(defined $t[17] && $t[17]=~/^(\d+)\/(\d+)$/){ + $strand_ratio=floor($1/$2*100)."%"; + $score=$t[18]; + } + if(defined $t[18] && $t[18]=~/^(\d+)\/(\d+)$/){ + #if intra-chr link with insert size filtering + $strand_ratio=floor($1/$2*100)."%"; + $link=floor($t[17]); + if($sv_type!~/^INV/){ + $insert_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/); + $score=$t[20]; + }else{ + $score=$t[19]; + } + } + } + + if(defined $t[18] && ($t[18] eq "UNBAL" || $t[18] eq "BAL")){ + + #if strand and order filtering only and/or interchr link + $eq_type=$t[18]; + $eq_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/); + ($bk1, $bk2)=($t[20],$t[21]); + foreach my $bk ($bk1, $bk2){$bk=~s/\),\(/ /g; $bk=~s/(\(|\))//g; $bk=~s/,/-/g;} + $score=$t[22]; + + }elsif(defined $t[19] && ($t[19] eq "UNBAL" || $t[19] eq "BAL")){ + + #if all three filtering + $link=floor($t[17]); + $eq_type=$t[19]; + $eq_ratio=floor($1/$2*100)."%" if($t[20]=~/^(\d+)\/(\d+)$/); + + if(defined $t[21] && $t[21]=~/^(\d+)\/(\d+)$/){ + $insert_ratio=floor($1/$2*100)."%"; + ($bk1, $bk2)=($t[22],$t[23]); + $score=$t[24]; + + }else{ + ($bk1, $bk2)=($t[21],$t[22]); + $score=$t[23]; + } + foreach my $bk ($bk1, $bk2){$bk=~s/\),\(/ /g; $bk=~s/(\(|\))//g; $bk=~s/,/-/g;} + + } + + + push(@sv, $chr_type, $sv_type,$eq_type); + push(@sv,"$chr1\t$start1-$end1"); + push(@sv, $link); + push(@sv,"$chr2\t$start2-$end2", + $nb_pairs,$strand_ratio,$eq_ratio,$insert_ratio, decimal($score,4), $bk1, $bk2); + + + print SV join("\t",@sv)."\n"; + } + + close LINKS; + close SV; + + system "sort -k 9,9nr -k 13,13nr $sv_file > $sv_file.sorted"; + + open SV, "<".$sv_file.".sorted" or die "$0: can't open in the output: $sv_file".".sorted :$!\n"; + my @links=; + close SV; + + open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n"; + + print SV join("\t",@header)."\n"; + print SV @links; + close SV; + + unlink($sv_file.".sorted"); + + print LOG "-- output created: $sv_file\n"; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub deleteBadOrderSensePairs{ + + my (@tab)=@_; + my @tab2; + + foreach my $v (@tab){ + + $v=~s/[\(\)]//g; + push(@tab2,$v) if($v!~/[\$\*\@]$/); + } + return @tab2; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#gets starts and ends Coords when start=leftmost given positions, directions and orders +sub getCoordswithLeftMost { + + my ($starts,$ends,$positions,$strand,$end_order,$tag_length) = @_; + + for my $i (0..scalar(@{$positions})-1) { + + if($strand->[$i] eq 'F'){ + $starts->[$i]=$positions->[$i]; + $ends->[$i]=$positions->[$i]+$tag_length->{$end_order->[$i]}-1; + }else{ + $starts->[$i]=$positions->[$i]-$tag_length->{$end_order->[$i]}+1; + $ends->[$i]=$positions->[$i]; + } + } +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub floor { + my $nb = $_[0]; + $nb=~ s/\..*//; + return $nb; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub decimal{ + + my $num=shift; + my $digs_to_cut=shift; + + $num=sprintf("%.".($digs_to_cut-1)."f", $num) if ($num=~/\d+\.(\d){$digs_to_cut,}/); + + return $num; +} + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#Sort links according the concerned chromosomes and their coordinates +sub sortLinks{ + + my ($links_file,$sortedlinks_file,$unique)=@_; + + print LOG "# Sorting links...\n"; + + my $pipe=($unique)? "| sort -u":""; + system "sort -k 1,1 -k 4,4 -k 2,2n -k 5,5n -k 8,8n $links_file $pipe > $sortedlinks_file"; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub getColor{ + + my($count,$hcolor,$format)=@_; + for my $col ( keys % { $hcolor} ) { + return $col if($count>=$hcolor->{$col}->[0] && $count<=$hcolor->{$col}->[1]); + } + return "white" if($format eq "circos"); + return "255,255,255" if($format eq "bed"); +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#check if the configuration file is correct +sub validateconfiguration{ + + my %conf=%{$_[0]}; + my $list_prgs="@ARGV"; + + my @circos_params=qw(organism_id colorcode); + my @bed_params=qw(colorcode); + my @compare_params=qw(list_samples list_read_lengths sample_link_file reference_link_file); + + unless (defined($conf{general}{output_dir})) { + $conf{general}{output_dir} = "."; + } + unless (-d $conf{general}{output_dir}){ + mkdir $conf{general}{output_dir} or die; + } + $conf{general}{output_dir}.="/" if($conf{general}{output_dir}!~/\/$/); + + + if($list_prgs=~/links2compare/){ + foreach my $p (@compare_params) { + die("Error Config : The compare parameter \"$p\" is not defined\n") if (!defined $conf{compare}{$p}); + } + + unless (defined($conf{compare}{same_sv_type})) { + $conf{compare}{same_sv_type} = 0; + } + + unless (defined($conf{compare}{min_overlap})) { + $conf{compare}{min_overlap} = 1E-9; + } + + if($conf{compare}{circos_output}){ + foreach my $p (@circos_params) { + next if($list_prgs=~/^ratio/ && $p eq "colorcode"); + die("Error Config : The circos parameter \"$p\" is not defined\n") if (!defined $conf{circos}{$p}); + } + } + if($conf{compare}{bed_output}){ + foreach my $p (@bed_params) { + die("Error Config : The bed parameter \"$p\" is not defined\n") if (!defined $conf{bed}{$p}); + } + die("Error Config : The compare parameter \"list_read_lengths\" is not defined\n") if (!defined $conf{compare}{list_read_lengths}); + + my @samples=split(",",$conf{compare}{list_samples}); + my @read_lengths=split(",",$conf{compare}{list_read_lengths}); + for my $i (0..$#samples){ + my @l=split("-",$read_lengths[$i]); + $conf{compare}{read_lengths}{$samples[$i]}={ 1=> $l[0], 2=> $l[1]}; + } + } + } + + +} +#------------------------------------------------------------------------------# +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# diff -r c284618dd8da -r 091714bd75a0 SVDetect_r0.8b_galaxy/svdetect/SVDetect_compare.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SVDetect_r0.8b_galaxy/svdetect/SVDetect_compare.xml Tue Jan 22 06:20:22 2013 -0500 @@ -0,0 +1,218 @@ + + +structural variants between two samples + +SVDetect_compare.pl links2compare -conf '$config_file' -l '$log_file' -N '$sample_name.$reference_name' + +#if $links2SV +-out1 '$common_sv_file' +-out2 '$sample_sv_file' +-out3 '$reference_sv_file' +#end if + +#if $file_conversion.file_conversion_select=="convert" and $file_conversion.links2circos +-out4 '$common_circos_file' +-out5 '$sample_circos_file' +-out6 '$reference_circos_file' +#end if + +#if $file_conversion.file_conversion_select=="convert" and $file_conversion.links2bed +-out7 '$common_bed_file' +-out8 '$sample_bed_file' +-out9 '$reference_bed_file' +#end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + links2SV is True + + + links2SV is True + + + links2SV is True + + + + ( + file_conversion['file_conversion_select']=="convert" and + file_conversion['links2circos'] is True + ) + + + + ( + file_conversion['file_conversion_select']=="convert" and + file_conversion['links2circos'] is True + ) + + + + ( + file_conversion['file_conversion_select']=="convert" and + file_conversion['links2circos'] is True + ) + + + + + ( + file_conversion['file_conversion_select']=="convert" and + file_conversion['links2bed'] is True + ) + + + + ( + file_conversion['file_conversion_select']=="convert" and + file_conversion['links2bed'] is True + ) + + + + ( + file_conversion['file_conversion_select']=="convert" and + file_conversion['links2bed'] is True + ) + + + + + + + + + + +<general> +output_dir=$__new_file_path__/svdetect +</general> + +#if $file_conversion.file_conversion_select == "convert" +#if $file_conversion.links2circos +<circos> +organism_id=${file_conversion.organism_id} +<colorcode> +#for $color_repeat in $file_conversion.color_code +${color_repeat.color}=${color_repeat.interval} +#end for +</colorcode> +</circos> +#end if +#if $file_conversion.links2bed +<bed> +<colorcode> +#for $color_repeat in $file_conversion.color_code +#if str($color_repeat.color)== "grey" +190,190,190=${color_repeat.interval} +#end if +#if str($color_repeat.color)== "black" +0,0,0=${color_repeat.interval} +#end if +#if str($color_repeat.color)== "blue" +0,0,255=${color_repeat.interval} +#end if +#if str($color_repeat.color)== "green" +0,255,0=${color_repeat.interval} +#end if +#if str($color_repeat.color)== "purple" +153,50,205=${color_repeat.interval} +#end if +#if str($color_repeat.color)== "orange" +255,140,0=${color_repeat.interval} +#end if +#if str($color_repeat.color)== "red" +255,0,0=${color_repeat.interval} +#end if +#end for +</colorcode> +</bed> +#end if +#end if + +<compare> +list_samples=${sample_name},${reference_name} +list_read_lengths=${sample_read1_length}-${sample_read2_length},${reference_read1_length}-${reference_read2_length} +sample_link_file=${sample_mates_file} +reference_link_file=${reference_mates_file} +min_overlap=${min_overlap} +same_sv_type=${same_sv_type} +sv_output=${links2SV} +#if $file_conversion.file_conversion_select == "convert" +circos_output=${$file_conversion.links2circos} +bed_output=${$file_conversion.links2bed} +#end if +</compare> + + + + + +**What it does** + +SVDetect - Version : 0.8b + +Comparison of clusters between two samples to get common or sample-specific SVs + +This program is designed to compare filtered links between two anomalously mapped mate-pair/paired-end datasets +and to identify common and sample-specific SVs (like the usual sample/reference design). +Overlaps between coordinates of clusters and types of SVs are used as parameters of comparison. + +Manual documentation available at the http://svdetect.sourceforge.net/Site/Manual.html + +----- + +.. class:: infomark + +Contact Bruno Zeitouni (svdetect@curie.fr) for any questions or concerns about the Galaxy implementation of SVDetect. + + + diff -r c284618dd8da -r 091714bd75a0 SVDetect_r0.8b_galaxy/svdetect/SVDetect_import.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SVDetect_r0.8b_galaxy/svdetect/SVDetect_import.sh Tue Jan 22 06:20:22 2013 -0500 @@ -0,0 +1,15 @@ +#!/bin/bash + + +while getopts "i:o:" optionName; do +case "$optionName" in + +i) INPUT="$OPTARG";; +o) OUTPUT="$OPTARG";; + +esac +done + +rm $OUTPUT + +ln -s $INPUT $OUTPUT diff -r c284618dd8da -r 091714bd75a0 SVDetect_r0.8b_galaxy/svdetect/SVDetect_import.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SVDetect_r0.8b_galaxy/svdetect/SVDetect_import.xml Tue Jan 22 06:20:22 2013 -0500 @@ -0,0 +1,85 @@ + + BAM, chromosome info or sv files + SVDetect_import.sh -i $file_path + #if str($type.file_type)=="bam" + -o $outbamfile + #elif str($type.file_type)=="len" + -o $outlenfile + #elif str($type.file_type)=="sv" + -o $outsvfile + #end if + + + + + + + + + + + + + + + + + + + + + + + + type['file_type']=="bam" + + + type['file_type']=="len" + + + type['file_type']=="sv" + + + +**What it does** + +This tool allows you to import quickly a BAM file, a chromosome info file or a SVDetect output file from you computer as inputs for SVDetect. + + +**Example of chromosome file** + +Input len file:: + + 1 chr1 247249719 + 2 chr2 242951149 + 3 chr3 199501827 + 4 chr4 191273063 + 5 chr5 180857866 + 6 chr6 170899992 + 7 chr7 158821424 + 8 chr8 146274826 + 9 chr9 140273252 + 10 chr10 135374737 + 11 chr11 134452384 + 12 chr12 132349534 + 13 chr13 114142980 + 14 chr14 106368585 + 15 chr15 100338915 + 16 chr16 88827254 + 17 chr17 78774742 + 18 chr18 76117153 + 19 chr19 63811651 + 20 chr20 62435964 + 21 chr21 46944323 + 22 chr22 49691432 + 23 chrX 154913754 + 24 chrY 57772954 + +----- + +.. class:: infomark + +Contact Bruno Zeitouni (svdetect@curie.fr) for any questions or concerns about the Galaxy implementation of SVDetect. + + + diff -r c284618dd8da -r 091714bd75a0 SVDetect_r0.8b_galaxy/svdetect/SVDetect_run_parallel.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SVDetect_r0.8b_galaxy/svdetect/SVDetect_run_parallel.pl Tue Jan 22 06:20:22 2013 -0500 @@ -0,0 +1,3537 @@ +#!/usr/bin/perl -w + +=pod + +=head1 NAME + +SVDetect - Program designed to the detection of structural variations +from paired-end/mate-pair sequencing data, compatible with SOLiD and Illumina (>=1.3) reads + +Version: 0.8b for Galaxy + +=head1 SYNOPSIS + +SVDetect -conf [-help] [-man] + + Command: + + linking detection and isolation of links + filtering filtering of links according different parameters + links2circos links conversion to circos format + links2bed paired-ends of links converted to bed format (UCSC) + links2SV formatted output to show most significant SVs + cnv calculate copy-number profiles + ratio2circos ratio conversion to circos density format + ratio2bedgraph ratio conversion to bedGraph density format (UCSC) + +=head1 DESCRIPTION + +This is a command-line interface to SVDetect. + + +=head1 AUTHORS + +Bruno Zeitouni Ebruno.zeitouni@curie.frE, +Valentina Boeva Evalentina.boeva@curie.frE + +=cut + +# ------------------------------------------------------------------- + +use strict; +use warnings; + +use Pod::Usage; +use Getopt::Long; + +use Config::General; +use Tie::IxHash; +use FileHandle; +use Parallel::ForkManager; + +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# +#PARSE THE COMMAND LINE +my %OPT; +GetOptions(\%OPT, + 'conf=s', + 'out1=s', #GALAXY + 'out2=s', #GALAXY + 'out3=s', #GALAXY + 'out4=s', #GALAXY + 'out5=s', #GALAXY + 'l=s', #GALAXY + 'N=s',#GALAXY + 'help',#GALAXY + 'man' + ); + +pod2usage() if $OPT{help}; +pod2usage(-verbose=>2) if $OPT{man}; +pod2usage(-message=> "$!", -exitval => 2) if (!defined $OPT{conf}); + +pod2usage() if(@ARGV<1); + +tie (my %func, 'Tie::IxHash',linking=>\&createlinks, + filtering=>\&filterlinks, + links2circos=>\&links2circos, + links2bed=>\&links2bed, + links2compare=>\&links2compare, + links2SV=>\&links2SV, + cnv=>\&cnv, + ratio2circos=>\&ratio2circos, + ratio2bedgraph=>\&ratio2bedgraph); + +foreach my $command (@ARGV){ + pod2usage(-message=> "Unknown command \"$command\"", -exitval => 2) if (!defined($func{$command})); +} +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# + + +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# +#READ THE CONFIGURATION FILE +my $conf=Config::General->new( -ConfigFile => $OPT{conf}, + -Tie => "Tie::IxHash", + -AllowMultiOptions => 1, + -LowerCaseNames => 1, + -AutoTrue => 1); +my %CONF= $conf->getall; +validateconfiguration(\%CONF); #validation of the configuration parameters + +my $SAMTOOLS_BIN_DIR="/bioinfo/local/samtools"; #GALAXY + +my $pt_log_file=$OPT{l}; #GALAXY +my $pt_links_file=$OPT{out1} if($OPT{out1}); #GALAXY +my $pt_flinks_file=$OPT{out2} if($OPT{out2}); #GALAXY +my $pt_sv_file=$OPT{out3} if($OPT{out3}); #GALAXY +my $pt_circos_file=$OPT{out4} if($OPT{out4}); #GALAXY +my $pt_bed_file=$OPT{out5} if($OPT{out5}); #GALAXY + +$CONF{general}{mates_file}=readlink($CONF{general}{mates_file});#GALAXY +$CONF{general}{cmap_file}=readlink($CONF{general}{cmap_file});#GALAXY + +my $log_file=$CONF{general}{output_dir}.$OPT{N}.".svdetect_run.log"; #GALAXY +open LOG,">$log_file" or die "$0: can't open ".$log_file.":$!\n";#GALAXY +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# + +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# +#COMMAND EXECUTION +foreach my $command (@ARGV){ + &{$func{$command}}(); +} +print LOG "-- end\n";#GALAXY + +close LOG;#GALAXY +system "rm $pt_log_file ; ln -s $log_file $pt_log_file"; #GALAXY +exit(0); +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# + + +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# +#FUNCTIONS +#------------------------------------------------------------------------------# +#MAIN FUNCTION number 1: Detection of links from mate-pairs data +sub createlinks{ + + my %CHR; #main hash table 1: fragments, links + my %CHRID; + my @MATEFILES; + + my $output_prefix=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}; + my @path=split(/\//,$output_prefix); + $output_prefix=$CONF{general}{output_dir}.$path[$#path]; + my $tmp_mates_prefix=$CONF{general}{tmp_dir}."mates/".$path[$#path]; + my $tmp_links_prefix=$CONF{general}{tmp_dir}."links/".$path[$#path]; + + shearingChromosome(\%CHR, \%CHRID, #making the genomic fragment library with the detection parameters + $CONF{detection}{window_size}, + $CONF{detection}{step_length}, + $CONF{general}{cmap_file}); + + if($CONF{detection}{split_mate_file}){ + + splitMateFile(\%CHR, \%CHRID, \@MATEFILES, $tmp_mates_prefix, + $CONF{general}{sv_type}, + $CONF{general}{mates_file}, + $CONF{general}{input_format}, + $CONF{general}{read_lengths} + ); + }else{ + + @MATEFILES=qx{ls $tmp_mates_prefix*} or die "# Error: No splitted mate files already created at $CONF{general}{tmp_dir} :$!"; + chomp(@MATEFILES); + print LOG "# Splitted mate files already created.\n"; + } + + + #Parallelization of the linking per chromosome for intra + interchrs + my $pm = new Parallel::ForkManager($CONF{general}{num_threads}); + + foreach my $matefile (@MATEFILES){ + + my $pid = $pm->start and next; + getlinks(\%CHR, \%CHRID, $matefile); + $pm->finish; + + } + $pm->wait_all_children; + + #Merge the chromosome links file into only one + my @LINKFILES= qx{ls $tmp_links_prefix*links} or die "# Error: No links files created at $CONF{general}{tmp_dir} :$!"; + chomp(@LINKFILES); + catFiles( \@LINKFILES => "$output_prefix.links" ); + + system "rm $pt_links_file; ln -s $output_prefix.links $pt_links_file" if (defined $pt_links_file); #GALAXY + print LOG "# Linking end procedure : output created: $output_prefix.links\n"; + #unlink(@LINKFILES); + #unlink(@MATEFILES); + + undef %CHR; + undef %CHRID; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub getlinks { + + my ($chr,$chrID,$tmp_mates_prefix)=@_; + + my $tmp_links_prefix=$tmp_mates_prefix; + $tmp_links_prefix=~s/\/mates\//\/links\//; + + my %PAIR; #main hash table 2: pairs + + linking($chr,$chrID, \%PAIR, #creation of all links from chromosome coordinates of pairs + $CONF{general}{read_lengths}, + $CONF{detection}{window_size}, + $CONF{detection}{step_length}, + $tmp_mates_prefix, + $CONF{general}{input_format}, + $CONF{general}{sv_type}, + "$tmp_links_prefix.links.mapped" + ); + + getUniqueLinks("$tmp_links_prefix.links.mapped", #remove the doublons + "$tmp_links_prefix.links.unique"); + + defineCoordsLinks($chr,$chrID, \%PAIR, #definition of the precise coordinates of links + $CONF{general}{input_format}, + $CONF{general}{sv_type}, + $CONF{general}{read_lengths}, + "$tmp_links_prefix.links.unique", + "$tmp_links_prefix.links.unique_defined"); + + sortLinks("$tmp_links_prefix.links.unique_defined", #sorting links from coordinates + "$tmp_links_prefix.links.sorted"); + + removeFullyOverlappedLinks("$tmp_links_prefix.links.sorted", #remove redundant links + "$tmp_links_prefix.links",1); #file output + + + undef %PAIR; + + unlink("$tmp_links_prefix.links.mapped", + "$tmp_links_prefix.links.unique", + "$tmp_links_prefix.links.unique_defined", + "$tmp_links_prefix.links.sorted"); +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub splitMateFile{ + + my ($chr,$chrID,$files_list,$output_prefix,$sv_type,$mates_file,$input_format,$tag_length)=@_; + + print LOG "# Splitting the mate file \"$mates_file\" for parallel processing...\n"; + + my %filesHandle; + + #fichier matefile inter + if($sv_type=~/^(all|inter)$/){ + my $newFileName="$output_prefix.interchrs"; + push(@{$files_list},$newFileName); + my $fh = new FileHandle; + $fh->open(">$newFileName"); + $filesHandle{inter}=$fh; + } + + #fichiers matefiles intra + if($sv_type=~/^(all|intra)$/){ + foreach my $k (1..$chr->{nb_chrs}){ + my $newFileName=$output_prefix.".".$chr->{$k}->{name}; + push(@{$files_list},$newFileName); + my $fh = new FileHandle; + $fh->open(">$newFileName"); + $filesHandle{$k}=$fh; + } + } + + if ($mates_file =~ /.gz$/) { + open(MATES, "gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; #gzcat + }elsif($mates_file =~ /.bam$/){ + open(MATES, "$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY + }else{ + open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n"; + } + + + while(){ + + my @t=split; + my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1,$end_order_read2); + + next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2, \$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length)); + + next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2}); + + ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2}); + + if( ($sv_type=~/^(all|inter)$/) && ($chr_read1 ne $chr_read2) ){ + my $fh2print=$filesHandle{inter}; + print $fh2print join("\t",@t)."\n"; + } + + if( ($sv_type=~/^(all|intra)$/) && ($chr_read1 eq $chr_read2) ){ + my $fh2print=$filesHandle{$chr_read1}; + print $fh2print join("\t",@t)."\n"; + + } + } + + foreach my $name (keys %filesHandle){ + my $fh=$filesHandle{$name}; + $fh->close; + } + + print LOG "# Splitted mate files of \"$mates_file\" created.\n"; +} + + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub filterlinks{ + + my %CHR; + my %CHRID; + my @LINKFILES; + my @FLINKFILES; + + my $output_prefix=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}; + my @path=split(/\//,$output_prefix); + $output_prefix=$CONF{general}{output_dir}.$path[$#path]; + my $tmp_links_prefix=$CONF{general}{tmp_dir}."links/".$path[$#path]; + + createChrHashTables(\%CHR,\%CHRID, + $CONF{general}{cmap_file}); + + if($CONF{filtering}{split_link_file}){ + + splitLinkFile(\%CHR, \%CHRID, \@LINKFILES, + $tmp_links_prefix, + $CONF{general}{sv_type}, + "$output_prefix.links", + ); + }else{ + + @LINKFILES=qx{ls $tmp_links_prefix*links} or die "# Error: No splitted link files already created\n"; + chomp(@LINKFILES); + print LOG "# Splitted link files already created.\n"; + } + + #Parallelization of the filtering per chromosome for intra + interchrs + my $pm = new Parallel::ForkManager($CONF{general}{num_threads}); + + foreach my $linkfile (@LINKFILES){ + + my $pid = $pm->start and next; + getFilteredlinks(\%CHR, \%CHRID, $linkfile); + $pm->finish; + + } + $pm->wait_all_children; + + #Merge the chromosome links file into only one + @FLINKFILES= qx{ls $tmp_links_prefix*filtered} or die "# Error: No links files created\n"; + chomp(@FLINKFILES); + catFiles( \@FLINKFILES => "$output_prefix.links.filtered" ); + + system "rm $pt_flinks_file; ln -s $output_prefix.links.filtered $pt_flinks_file" if (defined $pt_flinks_file); #GALAXY + print LOG"# Filtering end procedure : output created: $output_prefix.links.filtered\n"; + + undef %CHR; + undef %CHRID; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub splitLinkFile{ + + my ($chr,$chrID,$files_list,$input_prefix,$sv_type,$link_file)=@_; + + print LOG "# Splitting the link file for parallel processing...\n"; + + my %filesHandle; + + #fichier matefile inter + if($sv_type=~/^(all|inter)$/){ + my $newFileName="$input_prefix.interchrs.links"; + push(@{$files_list},$newFileName); + my $fh = new FileHandle; + $fh->open(">$newFileName"); + $filesHandle{inter}=$fh; + } + + #fichiers matefiles intra + if($sv_type=~/^(all|intra)$/){ + foreach my $k (1..$chr->{nb_chrs}){ + my $newFileName=$input_prefix.".".$chr->{$k}->{name}.".links"; + push(@{$files_list},$newFileName); + my $fh = new FileHandle; + $fh->open(">$newFileName"); + $filesHandle{$k}=$fh; + } + } + + open LINKS, "<".$link_file or die "$0: can't open ".$link_file.":$!\n"; + while(){ + + my @t=split; + my ($chr_read1,$chr_read2)=($t[0],$t[3]); + + next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2}); + + ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2}); + + if( ($sv_type=~/^(all|inter)$/) && ($chr_read1 ne $chr_read2) ){ + my $fh2print=$filesHandle{inter}; + print $fh2print join("\t",@t)."\n"; + } + + if( ($sv_type=~/^(all|intra)$/) && ($chr_read1 eq $chr_read2) ){ + my $fh2print=$filesHandle{$chr_read1}; + print $fh2print join("\t",@t)."\n"; + + } + } + + foreach my $name (keys %filesHandle){ + my $fh=$filesHandle{$name}; + $fh->close; + } + + print LOG "# Splitted link files created.\n"; +} + + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#MAIN FUNCTION number 2: Filtering processing +sub getFilteredlinks { + + my ($chr,$chrID,$tmp_links_prefix)=@_; + my %PAIR; + + strandFiltering($chr,$chrID, + $CONF{filtering}{nb_pairs_threshold}, #filtering of links + $CONF{filtering}{strand_filtering}, + $CONF{filtering}{chromosomes}, + $CONF{general}{input_format}, + $CONF{general}{cmap_file}, + $CONF{general}{mates_orientation}, + $CONF{general}{read_lengths}, + $tmp_links_prefix, + "$tmp_links_prefix.filtered", + ); + + if($CONF{filtering}{strand_filtering}){ #re-definition of links coordinates with strand filtering + + my @tmpfiles; + + rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_unique"); + + getUniqueLinks("$tmp_links_prefix.filtered_unique", + "$tmp_links_prefix.filtered"); + + push(@tmpfiles,"$tmp_links_prefix.filtered_unique"); + + if($CONF{filtering}{order_filtering}){ #filtering using the order + + rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_ordered"); + + orderFiltering($chr,$chrID, + $CONF{filtering}{nb_pairs_threshold}, + $CONF{filtering}{nb_pairs_order_threshold}, + $CONF{filtering}{mu_length}, + $CONF{filtering}{sigma_length}, + $CONF{general}{mates_orientation}, + $CONF{general}{read_lengths}, + "$tmp_links_prefix.filtered_ordered", + "$tmp_links_prefix.filtered", + ); + + push(@tmpfiles,"$tmp_links_prefix.filtered_ordered"); + } + + if (($CONF{filtering}{insert_size_filtering})&& + ($CONF{general}{sv_type} ne 'inter')){ + + rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_withoutIndelSize"); + + addInsertionInfo($chr,$chrID, + $CONF{filtering}{nb_pairs_threshold}, + $CONF{filtering}{order_filtering}, + $CONF{filtering}{indel_sigma_threshold}, + $CONF{filtering}{dup_sigma_threshold}, + $CONF{filtering}{singleton_sigma_threshold}, + $CONF{filtering}{mu_length}, + $CONF{filtering}{sigma_length}, + $CONF{general}{mates_orientation}, + $CONF{general}{read_lengths}, + "$tmp_links_prefix.filtered_withoutIndelSize", + "$tmp_links_prefix.filtered" + ); + + push(@tmpfiles,"$tmp_links_prefix.filtered_withoutIndelSize"); + } + + sortLinks("$tmp_links_prefix.filtered", + "$tmp_links_prefix.filtered_sorted"); + + removeFullyOverlappedLinks("$tmp_links_prefix.filtered_sorted", + "$tmp_links_prefix.filtered_nodup", + ); + + postFiltering("$tmp_links_prefix.filtered_nodup", + "$tmp_links_prefix.filtered", + $CONF{filtering}{final_score_threshold}); + + push(@tmpfiles,"$tmp_links_prefix.filtered_sorted","$tmp_links_prefix.filtered_nodup"); + + unlink(@tmpfiles); + + + } + undef %PAIR; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#MAIN FUNCTION number 3: Circos format conversion for links +sub links2circos{ + + my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered"; + my @path=split(/\//,$input_file); + $input_file=$CONF{general}{output_dir}.$path[$#path]; + + my $output_file.=$input_file.".segdup.txt"; + + links2segdup($CONF{circos}{organism_id}, + $CONF{circos}{colorcode}, + $input_file, + $output_file); #circos file output + + system "rm $pt_circos_file; ln -s $output_file $pt_circos_file" if (defined $pt_circos_file); #GALAXY +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#MAIN FUNCTION number 4: Bed format conversion for links +sub links2bed{ + + my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered"; + my @path=split(/\//,$input_file); + $input_file=$CONF{general}{output_dir}.$path[$#path]; + + my $output_file.=$input_file.".bed.txt"; + + links2bedfile($CONF{general}{read_lengths}, + $CONF{bed}{colorcode}, + $input_file, + $output_file); #bed file output + + system "rm $pt_bed_file; ln -s $output_file $pt_bed_file" if (defined $pt_bed_file); #GALAXY + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#MAIN FUNCTION number 6: Bed format conversion for links +sub links2SV{ + + my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered"; + + my @path=split(/\//,$input_file); + $input_file=$CONF{general}{output_dir}.$path[$#path]; + + my $output_file.=$input_file.".sv.txt"; + + + links2SVfile( $input_file, + $output_file); + + system "rm $pt_sv_file; ln -s $output_file $pt_sv_file" if (defined $pt_sv_file); #GALAXY +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#MAIN FUNCTION number 7: copy number variations, coverage ratio calculation +sub cnv{ + + my %CHR; + my %CHRID; + my @MATEFILES; + my @MATEFILES_REF; + + my $output_prefix=$CONF{general}{mates_file}; + my $output_prefix_ref=$CONF{detection}{mates_file_ref}; + my @path=split(/\//,$output_prefix); + my @path_ref=split(/\//,$output_prefix_ref); + $output_prefix=$CONF{general}{output_dir}.$path[$#path]; + $output_prefix_ref=$CONF{general}{output_dir}.$path_ref[$#path_ref]; + my $tmp_mates_prefix=$CONF{general}{tmp_dir}."mates/".$path[$#path]; + my $tmp_mates_prefix_ref=$CONF{general}{tmp_dir}."mates/".$path_ref[$#path_ref]; + my $tmp_density_prefix=$CONF{general}{tmp_dir}."density/".$path[$#path]; + + shearingChromosome(\%CHR, \%CHRID, #making the genomic fragment library with the detection parameters + $CONF{detection}{window_size}, + $CONF{detection}{step_length}, + $CONF{general}{cmap_file}); + + if($CONF{detection}{split_mate_file}){ + + splitMateFile(\%CHR, \%CHRID, \@MATEFILES, $tmp_mates_prefix, + "intra", + $CONF{general}{mates_file}, + $CONF{general}{input_format}, + $CONF{general}{read_lengths} + ); + + splitMateFile(\%CHR, \%CHRID, \@MATEFILES_REF, $tmp_mates_prefix_ref, + "intra", + $CONF{detection}{mates_file_ref}, + $CONF{general}{input_format}, + $CONF{general}{read_lengths} + ); + + + }else{ + + @MATEFILES=qx{ls $tmp_mates_prefix*} or die "# Error: No splitted sample mate files of \"$CONF{general}{mates_file}\" already created at $CONF{general}{tmp_dir} :$!"; + chomp(@MATEFILES); + @MATEFILES_REF=qx{ls $tmp_mates_prefix_ref*} or die "# Error: No splitted reference mate files of \"$CONF{detection}{mates_file_ref}\" already created at $CONF{general}{tmp_dir} :$!"; + chomp(@MATEFILES_REF); + print LOG "# Splitted sample and reference mate files already created.\n"; + } + + #Parallelization of the cnv per chromosome + my $pm = new Parallel::ForkManager($CONF{general}{num_threads}); + + foreach my $file (0..$#MATEFILES){ + + my $pid = $pm->start and next; + + densityCalculation(\%CHR, \%CHRID, $file, + $CONF{general}{read_lengths}, + $CONF{detection}{window_size}, + $CONF{detection}{step_length}, + \@MATEFILES, + \@MATEFILES_REF, + $MATEFILES[$file].".density", + $CONF{general}{input_format}); + + $pm->finish; + + } + $pm->wait_all_children; + + #Merge the chromosome links file into only one + my @DENSITYFILES= qx{ls $tmp_density_prefix*density} or die "# Error: No density files created at $CONF{general}{tmp_dir} :$!"; + chomp(@DENSITYFILES); + catFiles( \@DENSITYFILES => "$output_prefix.density" ); + + print LOG "# cnv end procedure : output created: $output_prefix.density\n"; + + + undef %CHR; + undef %CHRID; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#MAIN FUNCTION number 8: Circos format conversion for cnv ratios +sub ratio2circos{ + + my $input_file=$CONF{general}{mates_file}.".density"; + + my @path=split(/\//,$input_file); + $input_file=$CONF{general}{output_dir}.$path[$#path]; + + my $output_file.=$input_file.".segdup.txt"; + + ratio2segdup($CONF{circos}{organism_id}, + $input_file, + $output_file); +} +#------------------------------------------------------------------------------# +#MAIN FUNCTION number 9: BedGraph format conversion for cnv ratios +sub ratio2bedgraph{ + + my $input_file=$CONF{general}{mates_file}.".density"; + + my @path=split(/\//,$input_file); + $input_file=$CONF{general}{output_dir}.$path[$#path]; + + my $output_file.=$input_file.".bedgraph.txt"; + + ratio2bedfile($input_file, + $output_file); #bed file output +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#Creation of the fragment library +sub shearingChromosome{ + + print LOG "# Making the fragments library...\n"; + + my ($chr,$chrID,$window,$step,$cmap_file)=@_; #window and step sizes parameters + + createChrHashTables($chr,$chrID,$cmap_file); #hash tables: chromosome ID <=> chromsomes Name + + foreach my $k (1..$chr->{nb_chrs}){ + + print LOG"-- $chr->{$k}->{name}\n"; + + my $frag=1; + for (my $start=0; $start<$chr->{$k}->{length}; $start+=$step){ + + my $end=($start<($chr->{$k}->{length})-$window)? $start+$window-1:($chr->{$k}->{length})-1; + $chr->{$k}->{$frag}=[$start,$end]; #creation of fragments, coordinates storage + + if($end==($chr->{$k}->{length})-1){ + $chr->{$k}->{nb_frag}=$frag; #nb of fragments per chromosome + last; + } + $frag++; + } + } +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#Creation of chromosome hash tables from the cmap file +sub createChrHashTables{ + + my ($chr,$chrID,$cmap_file)=@_; + $chr->{nb_chrs}=0; + + open CMAP, "<".$cmap_file or die "$0: can't open ".$cmap_file.":$!\n"; + while(){ + + if(/^\s+$/){ next;} + my ($k,$name,$length) = split; + $chr->{$k}->{name}=$name; + $chr->{$k}->{length}=$length; + $chrID->{$name}=$k; + $chr->{nb_chrs}++; + + } + close CMAP; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#Read the mate file according the input format file (solid, eland or sam) +sub readMateFile{ + + my ($chr1,$chr2,$pos1,$pos2,$order1,$order2,$t,$file_type,$tag_length)=@_; + my ($strand1,$strand2); + + if($file_type eq "solid"){ + + ($$chr1,$$chr2,$$pos1,$$pos2,$$order1,$$order2)=($$t[6],$$t[7],$$t[8]+1,$$t[9]+1,1,2); #0-based + + }else{ + my ($tag_length1,$tag_length2); + ($$chr1,$$chr2,$$pos1,$strand1,$$pos2,$strand2,$$order1,$$order2,$tag_length1,$tag_length2)=($$t[11],$$t[12],$$t[7],$$t[8],$$t[9],$$t[10],1,2,length($$t[1]),length($$t[2])) #1-based + if($file_type eq "eland"); + + if($file_type eq "sam"){ + + return 0 if ($$t[0]=~/^@/); #header sam filtered out + + ($$chr1,$$chr2,$$pos1,$$pos2)=($$t[2],$$t[6],$$t[3],$$t[7]); + + return 0 if (($$t[1]&0x0004) || ($$t[1]&0x0008)); + + $$chr2=$$chr1 if($$chr2 eq "="); + + $strand1 = (($$t[1]&0x0010))? 'R':'F'; + $strand2 = (($$t[1]&0x0020))? 'R':'F'; + + $$order1= (($$t[1]&0x0040))? '1':'2'; + $$order2= (($$t[1]&0x0080))? '1':'2'; + $tag_length1 = $tag_length->{$$order1}; + $tag_length2 = $tag_length->{$$order2}; + } + + $$pos1 = -($$pos1+$tag_length1) if ($strand1 eq "R"); #get sequencing starts + $$pos2 = -($$pos2+$tag_length2) if ($strand2 eq "R"); + } + return 1; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#Parsing of the mates files and creation of links between 2 chromosomal fragments +sub linking{ + + my ($chr,$chrID,$pair,$tag_length,$window_dist,$step,$mates_file,$input_format,$sv_type,$links_file)=@_; + my %link; + + my $record=0; + my $nb_links=0; + my $warn=10000; + + my @sfile=split(/\./,$mates_file); + my $fchr=$sfile[$#sfile]; + + my $fh = new FileHandle; + + print LOG "# $fchr : Linking procedure...\n"; + print LOG "-- file=$mates_file\n". + "-- chromosome=$fchr\n". + "-- input format=$input_format\n". + "-- type=$sv_type\n". + "-- read1 length=$tag_length->{1}, read2 length=$tag_length->{2}\n". + "-- window size=$window_dist, step length=$step\n"; + + if ($mates_file =~ /.gz$/) { + $fh->open("gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; #gzcat + }elsif($mates_file =~ /.bam$/){ + $fh->open("$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY + }else{ + $fh->open("<".$mates_file) or die "$0: can't open ".$mates_file.":$!\n"; + } + + + while(<$fh>){ + + my @t=split; #for each mate-pair + my $mate=$t[0]; + my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1,$end_order_read2); + + next if(exists $$pair{$mate}); + + next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2, \$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length)); + + next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2}); + + ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2}); + + if($sv_type ne "all"){ + if( ($sv_type eq "inter") && ($chr_read1 ne $chr_read2) || + ($sv_type eq "intra") && ($chr_read1 eq $chr_read2) ){ + }else{ + next; + } + } + + $$pair{$mate}=[$chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1, $end_order_read2 ]; #fill out the hash pair table (ready for the defineCoordsLinks function) + + $record++; + + my ($coord_start_read1,$coord_end_read1,$coord_start_read2,$coord_end_read2); #get the coordinates of each read + + recupCoords($firstbase_read1,\$coord_start_read1,\$coord_end_read1,$tag_length->{$end_order_read1},$input_format); + recupCoords($firstbase_read2,\$coord_start_read2,\$coord_end_read2,$tag_length->{$end_order_read2},$input_format); + + for(my $i=1;$i<=$chr->{$chr_read1}->{'nb_frag'};$i++){ #fast genome parsing for link creation + + if (abs ($coord_start_read1-${$chr->{$chr_read1}->{$i}}[0]) <= $window_dist){ + + if(overlap($coord_start_read1,$coord_end_read1,${$chr->{$chr_read1}->{$i}}[0],${$chr->{$chr_read1}->{$i}}[1])){ + + for(my $j=1;$j<=$chr->{$chr_read2}->{'nb_frag'};$j++){ + + if (abs ($coord_start_read2-${$chr->{$chr_read2}->{$j}}[0]) <= $window_dist) { + + if(overlap($coord_start_read2,$coord_end_read2,${$chr->{$chr_read2}->{$j}}[0],${$chr->{$chr_read2}->{$j}}[1])){ + + makeLink(\%link,$chr_read1,$i,$chr_read2,$j,$mate,\$nb_links); #make the link + } + + }else{ + + $j=getNextFrag($coord_start_read2,$j,${$chr->{$chr_read2}->{$j}}[0],$chr->{$chr_read2}->{nb_frag},$window_dist,$step); + } + } + } + + }else{ + + $i=getNextFrag($coord_start_read1,$i,${$chr->{$chr_read1}->{$i}}[0],$chr->{$chr_read1}->{nb_frag},$window_dist,$step); + } + } + + if($record>=$warn){ + print LOG "-- $fchr : $warn mate-pairs analysed - $nb_links links done\n"; + $warn+=10000; + } + } + $fh->close; + + if(!$nb_links){ + print LOG "-- $fchr : No mate-pairs !\n". + "-- $fchr : No links have been found with the selected type of structural variations \($sv_type\)\n"; + } + + print LOG "-- $fchr : Total : $record mate-pairs analysed - $nb_links links done\n"; + + print LOG "-- $fchr : writing...\n"; + + $fh = new FileHandle; + + $fh->open(">".$links_file) or die "$0: can't write in the output ".$links_file." :$!\n"; + + foreach my $chr1 ( sort { $a <=> $b} keys %link){ #Sorted links output + + foreach my $chr2 ( sort { $a <=> $b} keys %{$link{$chr1}}){ + + foreach my $frag1 ( sort { $a <=> $b} keys %{$link{$chr1}{$chr2}}){ + + foreach my $frag2 ( sort { $a <=> $b} keys %{$link{$chr1}{$chr2}{$frag1}}){ + + my @count=split(",",$link{$chr1}{$chr2}{$frag1}{$frag2}); + print $fh "$chr->{$chr1}->{name}\t".(${$chr->{$chr1}->{$frag1}}[0]+1)."\t".(${$chr->{$chr1}->{$frag1}}[1]+1)."\t". + "$chr->{$chr2}->{name}\t".(${$chr->{$chr2}->{$frag2}}[0]+1)."\t".(${$chr->{$chr2}->{$frag2}}[1]+1)."\t". + scalar @count."\t". #nb of read + $link{$chr1}{$chr2}{$frag1}{$frag2}."\n"; #mate list + } + } + } + } + + $fh->close; + + undef %link; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#remove exact links doublons according to the mate list +sub getUniqueLinks{ + + my ($links_file,$nrlinks_file)=@_; + my %links; + my %pt; + my $nb_links; + my $n=1; + + my $record=0; + my $warn=300000; + + my @sfile=split(/\./,$links_file); + my $fchr=$sfile[$#sfile-2]; + + my $fh = new FileHandle; + + print LOG "# $fchr : Getting unique links...\n"; + $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; + + while(<$fh>){ + + my @t=split; + my $mates=$t[7]; + $record++; + + if(!exists $links{$mates}){ #Unique links selection + + $links{$mates}=[@t]; + $pt{$n}=$links{$mates}; + $n++; + + + }else{ #get the link coordinates from the mate-pairs list + + for my $i (1,2,4,5){ #get the shortest regions + + $links{$mates}->[$i]=($t[$i]>$links{$mates}->[$i])? $t[$i]:$links{$mates}->[$i] #maximum start + if($i==1 || $i==4); + $links{$mates}->[$i]=($t[$i]<$links{$mates}->[$i])? $t[$i]:$links{$mates}->[$i] #minimum end + if($i==2 || $i==5); + } + } + if($record>=$warn){ + print LOG "-- $fchr : $warn links analysed - ".($n-1)." unique links done\n"; + $warn+=300000; + } + } + $fh->close; + + $nb_links=$n-1; + print LOG "-- $fchr : Total : $record links analysed - $nb_links unique links done\n"; + + $fh = new FileHandle; + $fh->open(">$nrlinks_file") or die "$0: can't write in the output: $nrlinks_file :$!\n"; + print LOG "-- $fchr : writing...\n"; + for my $i (1..$nb_links){ + + print $fh join("\t",@{$pt{$i}})."\n"; #all links output + } + + $fh->close; + + undef %links; + undef %pt; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#get the new coordinates of each link from the mate list +sub defineCoordsLinks{ + + my ($chr,$chrID,$pair,$input_format,$sv_type,$tag_length,$links_file,$clinks_file)=@_; + + my @sfile=split(/\./,$links_file); + my $fchr=$sfile[$#sfile-2]; + + my $fh = new FileHandle; + my $fh2 = new FileHandle; + + $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; + $fh2->open(">$clinks_file") or die "$0: can't write in the output: $clinks_file :$!\n"; + + print LOG "# $fchr : Defining precise link coordinates...\n"; + + my $record=0; + my $warn=100000; + + my %coords; + my %strands; + my %order; + my %ends_order; + + while(<$fh>){ + + + my ($col1,$col2)=(1,2); #for an intrachromosomal link + my $diffchr=0; #difference between chr1 and chr2 + my ($chr1,$chr2,$mates_list,$npairs)=(split)[0,3,7,8]; + ($chr1,$chr2) = ($chrID->{$chr1},$chrID->{$chr2}); + if ($chr1 != $chr2){ #for an interchromosomal link + $col1=$col2=0; #no distinction + $diffchr=1; + } + + my @pairs=split(",",$mates_list); + + $coords{$col1}{$chr1}->{start}=undef; + $coords{$col1}{$chr1}->{end}=undef; + $coords{$col2}{$chr2}->{start}=undef; + $coords{$col2}{$chr2}->{end}=undef; + $strands{$col1}{$chr1}=undef; + $strands{$col2}{$chr2}=undef; + $ends_order{$col1}{$chr1}=undef; + $ends_order{$col2}{$chr2}=undef; + + + $order{$col1}{$chr1}->{index}->{1}=undef; + $order{$col1}{$chr1}->{index}->{2}=undef; + $order{$col2}{$chr2}->{index}->{1}=undef; + $order{$col2}{$chr2}->{index}->{2}=undef; + $order{$col1}{$chr1}->{order}=undef; + $order{$col2}{$chr2}->{order}=undef; + + $record++; + + for my $p (0..$#pairs){ #for each pair + + my ($coord_start_read1,$coord_end_read1); + my ($coord_start_read2,$coord_end_read2); + my $strand_read1=recupCoords(${$$pair{$pairs[$p]}}[2],\$coord_start_read1,\$coord_end_read1,$tag_length->{${$$pair{$pairs[$p]}}[4]},$input_format); + my $strand_read2=recupCoords(${$$pair{$pairs[$p]}}[3],\$coord_start_read2,\$coord_end_read2,$tag_length->{${$$pair{$pairs[$p]}}[5]},$input_format); + + if(!$diffchr){ #for a intrachromosomal link + if($coord_start_read2<$coord_start_read1){ #get the closer start coordinate for each column + ($col1,$col2)=(2,1); + }else{ + ($col1,$col2)=(1,2); + } + } + + push(@{$coords{$col1}{${$$pair{$pairs[$p]}}[0]}->{start}},$coord_start_read1); #get coords and strands of f3 and r3 reads + push(@{$coords{$col1}{${$$pair{$pairs[$p]}}[0]}->{end}},$coord_end_read1); + push(@{$coords{$col2}{${$$pair{$pairs[$p]}}[1]}->{start}},$coord_start_read2); + push(@{$coords{$col2}{${$$pair{$pairs[$p]}}[1]}->{end}},$coord_end_read2); + push(@{$strands{$col1}{${$$pair{$pairs[$p]}}[0]}},$strand_read1); + push(@{$strands{$col2}{${$$pair{$pairs[$p]}}[1]}},$strand_read2); + push(@{$ends_order{$col1}{${$$pair{$pairs[$p]}}[0]}},${$$pair{$pairs[$p]}}[4]); + push(@{$ends_order{$col2}{${$$pair{$pairs[$p]}}[1]}},${$$pair{$pairs[$p]}}[5]); + } + + ($col1,$col2)=(1,2) if(!$diffchr); + + my $coord_start_chr1=min(min(@{$coords{$col1}{$chr1}->{start}}),min(@{$coords{$col1}{$chr1}->{end}})); #get the biggest region + my $coord_end_chr1=max(max(@{$coords{$col1}{$chr1}->{start}}),max(@{$coords{$col1}{$chr1}->{end}})); + my $coord_start_chr2=min(min(@{$coords{$col2}{$chr2}->{start}}),min(@{$coords{$col2}{$chr2}->{end}})); + my $coord_end_chr2=max(max(@{$coords{$col2}{$chr2}->{start}}),max(@{$coords{$col2}{$chr2}->{end}})); + + @{$order{$col1}{$chr1}->{index}->{1}}= sort {${$coords{$col1}{$chr1}->{start}}[$a] <=> ${$coords{$col1}{$chr1}->{start}}[$b]} 0 .. $#{$coords{$col1}{$chr1}->{start}}; + @{$order{$col2}{$chr2}->{index}->{1}}= sort {${$coords{$col2}{$chr2}->{start}}[$a] <=> ${$coords{$col2}{$chr2}->{start}}[$b]} 0 .. $#{$coords{$col2}{$chr2}->{start}}; + + foreach my $i (@{$order{$col1}{$chr1}->{index}->{1}}){ #get the rank of the chr2 reads according to the sorted chr1 reads (start coordinate sorting) + foreach my $j (@{$order{$col2}{$chr2}->{index}->{1}}){ + + if(${$order{$col1}{$chr1}->{index}->{1}}[$i] == ${$order{$col2}{$chr2}->{index}->{1}}[$j]){ + ${$order{$col1}{$chr1}->{index}->{2}}[$i]=$i; + ${$order{$col2}{$chr2}->{index}->{2}}[$i]=$j; + last; + } + } + } + + foreach my $i (@{$order{$col1}{$chr1}->{index}->{2}}){ #use rank chr1 as an ID + foreach my $j (@{$order{$col2}{$chr2}->{index}->{2}}){ + + if(${$order{$col1}{$chr1}->{index}->{2}}[$i] == ${$order{$col2}{$chr2}->{index}->{2}}[$j]){ + ${$order{$col1}{$chr1}->{order}}[$i]=$i+1; + ${$order{$col2}{$chr2}->{order}}[$i]=$j+1; + last; + } + } + } + + @pairs=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},\@pairs);#sorting of the pairs, strands, and start coords from the sorted chr2 reads + @{$strands{$col1}{$chr1}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$strands{$col1}{$chr1}); + @{$strands{$col2}{$chr2}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$strands{$col2}{$chr2}); + @{$ends_order{$col1}{$chr1}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$ends_order{$col1}{$chr1}); + @{$ends_order{$col2}{$chr2}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$ends_order{$col2}{$chr2}); + @{$coords{$col1}{$chr1}->{start}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$coords{$col1}{$chr1}->{start}); + @{$coords{$col2}{$chr2}->{start}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$coords{$col2}{$chr2}->{start}); + + + my @link=($chr->{$chr1}->{name}, $coord_start_chr1 , $coord_end_chr1, #all information output + $chr->{$chr2}->{name}, $coord_start_chr2 , $coord_end_chr2, + scalar @pairs, + join(",",@pairs), + join(",",@{$strands{$col1}{$chr1}}), + join(",",@{$strands{$col2}{$chr2}}), + join(",",@{$ends_order{$col1}{$chr1}}), + join(",",@{$ends_order{$col2}{$chr2}}), + join(",",@{$order{$col1}{$chr1}->{order}}), + join(",",@{$order{$col2}{$chr2}->{order}}), + join(",",@{$coords{$col1}{$chr1}->{start}}), + join(",",@{$coords{$col2}{$chr2}->{start}})); + + print $fh2 join("\t",@link)."\n"; + + if($record>=$warn){ + print LOG "-- $fchr : $warn links processed\n"; + $warn+=100000; + } + } + $fh->close; + $fh2->close; + + print LOG "-- $fchr : Total : $record links processed\n"; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#Sort links according the concerned chromosomes and their coordinates +sub sortLinks{ + + my ($links_file,$sortedlinks_file,$unique)=@_; + + my @sfile=split(/\./,$links_file); + my $fchr=$sfile[$#sfile-2]; + + + print LOG "# $fchr : Sorting links...\n"; + + my $pipe=($unique)? "| sort -u":""; + system "sort -k 1,1 -k 4,4 -k 2,2n -k 5,5n -k 8,8n $links_file $pipe > $sortedlinks_file"; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#removal of fully overlapped links +sub removeFullyOverlappedLinks{ + + my ($links_file,$nrlinks_file,$warn_out)=@_; + + my %pt; + my $n=1; + + my @sfile=split(/\./,$links_file); + my $fchr=$sfile[$#sfile-2]; + + my $fh = new FileHandle; + + $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; + while(<$fh>){ + + my @t=split("\t",$_); + $pt{$n}=[@t]; + $n++; + } + $fh->close; + + my $nb_links=$n-1; + my $nb=$nb_links; + + my %pt2; + my $nb2=1; + my $record=0; + my $warn=10000; + + print LOG "# $fchr : Removing fully overlapped links...\n"; + + LINK: + + for my $i (1..$nb){ + + my @link=(); + my @next_link=(); + my $ind1=$i; + + $record++; + if($record>=$warn){ + print LOG "-- $fchr : $warn unique links analysed - ".($nb2-1)." non-overlapped links done\n"; + $warn+=10000; + } + + if(exists $pt{$ind1}){ + @link=@{$pt{$ind1}}; #link1 + }else{ + next LINK; + } + + my ($chr1,$start1,$end1,$chr2,$start2,$end2)=($link[0],$link[1],$link[2],$link[3],$link[4],$link[5]); #get info of link1 + my @mates=deleteBadOrderSensePairs(split(",",$link[7])); + + my $ind2=$ind1+1; + $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb); #get the next found link + + if($ind2<=$nb){ + + @next_link=@{$pt{$ind2}}; #link2 + my ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]); #get info of link2 + my @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7])); + + while(($chr1 eq $chr3 && $chr2 eq $chr4) && overlap($start1,$end1,$start3,$end3)){ #loop here according to the chr1 coordinates, need an overlap between links to enter + + if(!overlap($start2,$end2,$start4,$end4)){ #if no overlap with chr2 coordinates ->next link2 + + $ind2++; + $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb); + + if($ind2>$nb){ #if no more link in the file -> save link1 + + $pt2{$nb2}=\@link; + $nb2++; + next LINK; + } + + @next_link=@{$pt{$ind2}}; + ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]); + @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7])); + next; + } + + my %mates=map{$_ =>1} @mates; #get the equal number of mates + my @same_mates = grep( $mates{$_}, @next_mates ); + my $nb_mates= scalar @same_mates; + + if($nb_mates == scalar @mates){ + + delete $pt{$ind1}; #if pairs of link 1 are all included in link 2 -> delete link1 + next LINK; #go to link2, link2 becomes link1 + + }else{ + delete $pt{$ind2} if($nb_mates == scalar @next_mates); #if pairs of link2 are all included in link 1 -> delete link2 + $ind2++; #we continue by checking the next link2 + $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb); + + if($ind2>$nb){ #if no more link in the file -> save link1 + + $pt2{$nb2}=\@link; + $nb2++; + next LINK; + } + + @next_link=@{$pt{$ind2}}; #get info of link2 + ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]); + @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7])); + + } + } + } + $pt2{$nb2}=\@link; #if no (more) link with chr1 coordinates overlap -> save link1 + $nb2++; + } + + print LOG "-- $fchr : Total : $nb_links unique links analysed - ".($nb2-1)." non-overlapped links done\n"; + + #OUTPUT + + $fh = new FileHandle; + $fh->open(">$nrlinks_file") or die "$0: can't write in the output: $nrlinks_file :$!\n"; + print LOG "-- $fchr : writing...\n"; + for my $i (1..$nb2-1){ + + print $fh join("\t",@{$pt2{$i}}); #all links output + } + + close $fh; + + print LOG "-- $fchr : output created: $nrlinks_file\n" if($warn_out); + + undef %pt; + undef %pt2; +} +#------------------------------------------------------------------------------# +sub postFiltering { + + my ($links_file,$pflinks_file, $finalScore_thres)=@_; + + my @sfile=split(/\./,$links_file); + my $fchr=$sfile[$#sfile-2]; + + + my ($nb,$nb2)=(0,0); + + print LOG "# $fchr : Post-filtering links...\n"; + print LOG "-- $fchr : final score threshold = $finalScore_thres\n"; + + my $fh = new FileHandle; + my $fh2 = new FileHandle; + + $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; + $fh2->open(">$pflinks_file") or die "$0: can't write in the output: $pflinks_file :$!\n"; + + + while(<$fh>){ + + my @t=split("\t",$_); + my $score=$t[$#t-1]; + + if($score >= $finalScore_thres){ + print $fh2 join("\t", @t); + $nb2++; + } + $nb++; + } + $fh->close; + $fh2->close; + + print LOG "-- $fchr : Total : $nb unique links analysed - $nb2 links kept\n"; + print LOG "-- $fchr : output created: $pflinks_file\n"; +} + + + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#Filtering of the links +sub strandFiltering{ + + my($chr,$chrID,$pairs_threshold,$strand_filtering,$chromosomes, + $input_format,$cmap_file,$mate_sense, $tag_length,$links_file,$flinks_file)=@_; + + my @sfile=split(/\./,$links_file); + my $fchr=$sfile[$#sfile-1]; + + + my %chrs; + my %chrs1; + my %chrs2; + my $nb_chrs; + my $exclude; + + if($chromosomes){ + my @chrs=split(",",$chromosomes); + $nb_chrs=scalar @chrs; + $exclude=($chrs[0]=~/^\-/)? 1:0; + for my $chrName (@chrs){ + $chrName=~s/^(\-)//; + my $col=($chrName=~s/_(1|2)$//); + + if(!$col){ + $chrs{$chrID->{$chrName}}=undef + }else{ + $chrs1{$chrID->{$chrName}}=undef if($1==1); + $chrs2{$chrID->{$chrName}}=undef if($1==2); + } + } + } + + my $record=0; + my $nb_links=0; + my $warn=10000; + + my $sens_ratio_threshold=0.6; + + print LOG "\# Filtering procedure...\n"; + print LOG "\# Number of pairs and strand filtering...\n"; + print LOG "-- file=$links_file\n"; + print LOG "-- nb_pairs_threshold=$pairs_threshold, strand_filtering=".(($strand_filtering)? "yes":"no"). + ", chromosomes=".(($chromosomes)? "$chromosomes":"all")."\n"; + + + + my $fh = new FileHandle; + my $fh2 = new FileHandle; + + $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; + $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n"; + + while(<$fh>){ + + my @t=split; #for each link + my $is_good=1; + $record++; + + + if($chromosomes){ + + my ($chr1,$chr2)=($chrID->{$t[0]},$chrID->{$t[3]}); + + if(!$exclude){ + $is_good=(exists $chrs{$chr1} && exists $chrs{$chr2})? 1:0; + $is_good=(exists $chrs1{$chr1} && exists $chrs2{$chr2})? 1:0 if(!$is_good); + $is_good=($nb_chrs==1 && (exists $chrs1{$chr1} || exists $chrs2{$chr2}))? 1:0 if(!$is_good); + }else{ + $is_good=(exists $chrs{$chr1} || exists $chrs{$chr2})? 0:1; + $is_good=(exists $chrs1{$chr1} || exists $chrs2{$chr2})? 0:1 if($is_good); + } + } + + $is_good = ($is_good && $t[6] >= $pairs_threshold)? 1 :0; #filtering according the number of pairs + if($is_good && $strand_filtering){ #if filtering according the strand sense + + my @mates=split(/,/,$t[7]); #get the concordant pairs in the strand sense + my @strands1=split(/,/,$t[8]); + my @strands2=split(/,/,$t[9]); + + my %mate_class=( 'FF' => 0, 'RR' => 0, 'FR' => 0, 'RF' => 0); + + my %mate_reverse=( 'FF' => 'RR', 'RR' => 'FF', #group1: FF,RR + 'FR' => 'RF', 'RF' => 'FR'); #group2: FR,RF + + my %mate_class2=( $mate_sense=>"NORMAL_SENSE", inverseSense($mate_sense)=>"NORMAL_SENSE", + substr($mate_sense,0,1).inverseSense(substr($mate_sense,1,1))=>"REVERSE_SENSE", + inverseSense(substr($mate_sense,0,1)).substr($mate_sense,1,1)=>"REVERSE_SENSE"); + + if($t[6] == 1){ + + push(@t,$mate_class2{$strands1[0].$strands2[0]},"1/1",1,1); + + }else{ + + tie (my %class,'Tie::IxHash'); + my $split; + + foreach my $i (0..$#mates){ + $mate_class{$strands1[$i].$strands2[$i]}++; #get the over-represented group + } + + my $nb_same_sens_class=$mate_class{FF}+$mate_class{RR}; + my $nb_diff_sens_class=$mate_class{FR}+$mate_class{RF}; + my $sens_ratio=max($nb_same_sens_class,$nb_diff_sens_class)/($nb_same_sens_class+$nb_diff_sens_class); + + if($sens_ratio < $sens_ratio_threshold){ + %class=(1=>'FF', 2=>'FR'); + $split=1; + }else{ + $class{1}=($nb_same_sens_class > $nb_diff_sens_class)? 'FF':'FR'; #if yes get the concerned class + $split=0; + } + + $is_good=getConsistentSenseLinks(\@t,\@mates,\@strands1,\@strands2,$tag_length,$mate_sense,\%mate_reverse,\%mate_class2,\%class,$split,$pairs_threshold); + } + } + + if($is_good){ #PRINT + + my $nb=scalar @t; + if($nb > 20){ + my @t2=splice(@t,0,20); + print $fh2 join("\t",@t2)."\n"; + $nb_links++; + } + $nb_links++; + print $fh2 join("\t",@t)."\n"; + } + + if($record>=$warn){ + print LOG "-- $fchr : $warn links analysed - $nb_links links kept\n"; + $warn+=10000; + } + } + $fh->close; + $fh2->close; + + print LOG "-- $fchr : No links have been found with the selected filtering parameters\n" if(!$nb_links); + + print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n"; + + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub getConsistentSenseLinks{ + + my ($t,$mates,$strands1,$strands2,$tag_length,$mate_sense, $mate_reverse,$mate_class2, $class, $split,$thres)=@_; + + my $npairs=scalar @$mates; + + my @ends_order1 = split (/,/,$$t[10]); + my @ends_order2 = split (/,/,$$t[11]); + my @order1 = split (/,/,$$t[12]); + my @order2 = split (/,/,$$t[13]); + my @positions1 = split (/,/,$$t[14]); + my @positions2 = split (/,/,$$t[15]); + + my @newlink; + + foreach my $ind (keys %{$class} ){ + + tie (my %flink,'Tie::IxHash'); + my @orders2remove=(); + + foreach my $i (0..$#{$mates}){ #get the pairs belonging the over-represented group + + if((($$strands1[$i].$$strands2[$i]) eq $$class{$ind}) || (($$strands1[$i].$$strands2[$i]) eq $$mate_reverse{$$class{$ind}})){ + push(@{$flink{mates}},$$mates[$i]); + push(@{$flink{strands1}},$$strands1[$i]); + push(@{$flink{strands2}},$$strands2[$i]); + push(@{$flink{ends_order1}},$ends_order1[$i]); + push(@{$flink{ends_order2}},$ends_order2[$i]); + push(@{$flink{positions1}},$positions1[$i]); + push(@{$flink{positions2}},$positions2[$i]); + + }else{ + push(@orders2remove,$order1[$i]); + } + } + + @{$flink{order1}}=(); + @{$flink{order2}}=(); + if(scalar @orders2remove > 0){ + getNewOrders(\@order1,\@order2,\@orders2remove,$flink{order1},$flink{order2}) + }else{ + @{$flink{order1}}=@order1; + @{$flink{order2}}=@order2; + } + + my @ends1; getEnds(\@ends1,$flink{positions1},$flink{strands1},$flink{ends_order1},$tag_length); + my @ends2; getEnds(\@ends2,$flink{positions2},$flink{strands2},$flink{ends_order2},$tag_length); + + my $fnpairs=scalar @{$flink{mates}}; + my $strand_filtering_ratio=$fnpairs."/".$npairs; + my $real_ratio=$fnpairs/$npairs; + + if($fnpairs>=$thres){ #filtering according the number of pairs + + push(@newlink, + $$t[0], + min(min(@{$flink{positions1}}),min(@ends1)), + max(max(@{$flink{positions1}}),max(@ends1)), + $$t[3], + min(min(@{$flink{positions2}}),min(@ends2)), + max(max(@{$flink{positions2}}),max(@ends2)), + $fnpairs, + join(",",@{$flink{mates}}), + join(",",@{$flink{strands1}}), + join(",",@{$flink{strands2}}), + join(",",@{$flink{ends_order1}}), + join(",",@{$flink{ends_order2}}), + join(",",@{$flink{order1}}), + join(",",@{$flink{order2}}), + join(",",@{$flink{positions1}}), + join(",",@{$flink{positions2}}), + $$mate_class2{${$flink{strands1}}[0].${$flink{strands2}}[0]}, + $strand_filtering_ratio, + $real_ratio, + $npairs + ); + } + } + + if (grep {defined($_)} @newlink) { + @$t=@newlink; + return 1 + } + return 0; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub getNewOrders{ + + my($tab1,$tab2,$list,$newtab1,$newtab2)=@_; + my $j=1; + my $k=1; + for my $i (0..$#{$tab2}){ + my $c=0; + for my $j (0..$#{$list}){ + $c++ if(${$list}[$j] < ${$tab2}[$i]); + if(${$list}[$j] == ${$tab2}[$i]){ + $c=-1; last; + } + } + if($c!=-1){ + push(@{$newtab2}, ${$tab2}[$i]-$c); + push(@{$newtab1}, $k); + $k++; + } + } +} + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#Filtering of the links using their order +sub orderFiltering { + + my ($chr,$chrID,$nb_pairs_threshold,$nb_pairs_order_threshold,$mu,$sigma,$mate_sense,$tag_length,$links_file,$flinks_file)=@_; + + my @sfile=split(/\./,$links_file); + my $fchr=$sfile[$#sfile-2]; + + + my $diff_sense_ends=(($mate_sense eq "FR") || ($mate_sense eq "RF"))? 1:0; + + my $record=0; + my $warn=10000; + my $nb_links=0; + + my $quant05 = 1.644854; + my $quant001 = 3.090232; + my $alphaDist = $quant05 * 2 * $sigma; + my $maxFragmentLength = &floor($quant001 * $sigma + $mu); + + print LOG "\# Filtering by order...\n"; + print LOG "-- mu length=$mu, sigma length=$sigma, nb pairs order threshold=$nb_pairs_order_threshold\n"; + print LOG "-- distance between comparable pairs was set to $alphaDist\n"; + print LOG "-- maximal fragment length was set to $maxFragmentLength\n"; + + + my $fh = new FileHandle; + my $fh2 = new FileHandle; + + $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; + $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n"; + + while(<$fh>){ + + $record++; + my @t = split; + my ($chr1,$chr2,$mates_list)=@t[0,3,7]; + my @pairs=split(",",$mates_list); + ($chr1,$chr2) = ($chrID->{$chr1},$chrID->{$chr2}); + my ($coord_start_chr1,$coord_end_chr1,$coord_start_chr2,$coord_end_chr2) = @t[1,2,4,5]; + my $numberOfPairs = $t[6]; + my @strand1 = split (/,/,$t[8]); + my @strand2 = split (/,/,$t[9]); + my @ends_order1 = split (/,/,$t[10]); + my @ends_order2 = split (/,/,$t[11]); + my @order1 = split (/,/,$t[12]); + my @order2 = split (/,/,$t[13]); + my @positions1 = split (/,/,$t[14]); + my @positions2 = split (/,/,$t[15]); + my @ends1; getEnds(\@ends1,\@positions1,\@strand1,\@ends_order1,$tag_length); + my @ends2; getEnds(\@ends2,\@positions2,\@strand2,\@ends_order2,$tag_length); + my $clusterCoordinates_chr1; + my $clusterCoordinates_chr2; + my $reads_left = 0; + + my $ifRenv = $t[16]; + my $strand_ratio_filtering=$t[17]; + + #kind of strand filtering. For example, will keep only FFF-RRR from a link FFRF-RRRF if orientation is correct + my ($singleBreakpoint, %badInFRSense) = findBadInFRSenseSOLiDSolexa(\@strand1,\@strand2,\@ends_order1,\@ends_order2,\@order1,\@order2,$mate_sense); + #find pairs type F-RRRR or FFFF-R in the case if orientation is correct + #These pairs are annotated as BED pairs forever! They won't be recycled! + my $table; + for my $i (0..$numberOfPairs-1) { #fill the table with non adequate pairs: pairID numberOfNonAdPairs nonAdPairIDs + my $nonAdeq = 0; + for my $j (0..$i-1) { + if (exists($table->{$j}->{$i})) { + $nonAdeq++; + $table->{$i}->{$j} = 1; + } + } + for my $j ($i+1..$numberOfPairs-1) { + if ($positions1[$j]-$positions1[$i]>$alphaDist) { + if (&reversed ($i,$j,$ifRenv,\@positions2)) { + $nonAdeq++; + $table->{$i}->{$j} = 1; + } + } + } + $table->{$i}->{nonAdeq} = $nonAdeq; + } + + for my $bad (keys %badInFRSense) { #remove pairs type F-RRRR or FFFF-R in the case of orientation + &remove($bad,$table); + } + + my @falseReads; + #RRRR-F -> RRRR or R-FFFF -> FFFF + @falseReads = findBadInRFSenseSOLiDSolexa(\@strand1,\@ends_order1,$mate_sense, keys %{$table}); + #these pairs will be recycled later as $secondTable + for my $bad (@falseReads) { + &remove($bad,$table); + } + + my $bad = &check($table); + while ($bad ne "OK") { #clear the table to reject non adequate pairs in the sense of ORDER + # push (@falseReads, $bad); remove completely!!! + &remove($bad,$table); + $bad = &check($table); + } + + $reads_left = scalar keys %{$table}; + my $coord_start_chr1_cluster1 = min(min(@positions1[sort {$a<=>$b} keys %{$table}]),min(@ends1[sort {$a<=>$b} keys %{$table}])); + my $coord_end_chr1_cluster1 = max(max(@positions1[sort {$a<=>$b} keys %{$table}]),max(@ends1[sort {$a<=>$b} keys %{$table}])); + my $coord_start_chr2_cluster1 = min(min(@positions2[sort {$a<=>$b} keys %{$table}]),min(@ends2[sort {$a<=>$b} keys %{$table}])); + my $coord_end_chr2_cluster1 = max(max(@positions2[sort {$a<=>$b} keys %{$table}]),max(@ends2[sort {$a<=>$b} keys %{$table}])); + + $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')'; + $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')'; + + my $ifBalanced = 'UNBAL'; + my $secondTable; + my $clusterCoordinates; + my ($break_pont_chr1,$break_pont_chr2); + + my $signatureType=""; + + my $maxCoord1 =$chr->{$chr1}->{length}; + my $maxCoord2 =$chr->{$chr2}->{length}; + + if (scalar @falseReads) { + @falseReads = sort @falseReads; + #now delete FRFR choosing the majority + my @newfalseReads; #find and remove pairs type RRRR-F or R-FFFF + @newfalseReads = findBadInRFSenseSOLiDSolexa(\@strand1,\@ends_order1,$mate_sense,@falseReads); #these @newfalseReads won't be recycled + my %hashTmp; + for my $count1 (0..scalar(@falseReads)-1) { + my $i = $falseReads[$count1]; + $hashTmp{$i} = 1; + for my $bad (@newfalseReads) { + if ($bad == $i) { + delete $hashTmp{$i}; + next; + } + } + } + @falseReads = sort keys %hashTmp; #what is left + for my $count1 (0..scalar(@falseReads)-1) { #fill the table for reads which were previously rejected + my $nonAdeq = 0; + my $i = $falseReads[$count1]; + + for my $count2 (0..$count1-1) { + my $j = $falseReads[$count2]; + if (exists($secondTable->{$j}->{$i})) { + $nonAdeq++; + $secondTable->{$i}->{$j} = 1; + } + } + for my $count2 ($count1+1..scalar(@falseReads)-1) { + my $j = $falseReads[$count2]; + if ($positions1[$j]-$positions1[$i]>$alphaDist) { + if (&reversed ($i,$j,$ifRenv,\@positions2)) { + $nonAdeq++; + $secondTable->{$i}->{$j} = 1; + } + } + } + $secondTable->{$i}->{nonAdeq} = $nonAdeq; + } + + my @falseReads2; + my $bad = &check($secondTable); + while ($bad ne "OK") { #clear the table to reject non adequate pairs + push (@falseReads2, $bad); + &remove($bad,$secondTable); + $bad = &check($secondTable); + } + if (scalar keys %{$secondTable} >= $nb_pairs_order_threshold) { + my $coord_start_chr1_cluster2 = min(min(@positions1[sort {$a<=>$b} keys %{$secondTable}]),min(@ends1[sort {$a<=>$b} keys %{$secondTable}])); + my $coord_end_chr1_cluster2 = max(max(@positions1[sort {$a<=>$b} keys %{$secondTable}]),max(@ends1[sort {$a<=>$b} keys %{$secondTable}])); + my $coord_start_chr2_cluster2 = min(min(@positions2[sort {$a<=>$b} keys %{$secondTable}]),min(@ends2[sort {$a<=>$b} keys %{$secondTable}])); + my $coord_end_chr2_cluster2 = max(max(@positions2[sort {$a<=>$b} keys %{$secondTable}]),max(@ends2[sort {$a<=>$b} keys %{$secondTable}])); + + $ifBalanced = 'BAL'; + + if ($ifBalanced eq 'BAL') { + + if (scalar keys %{$table} < $nb_pairs_order_threshold) { + $ifBalanced = 'UNBAL'; #kill cluster 1! + ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2 + $reads_left = scalar keys %{$table}; + $coord_start_chr1_cluster1 = $coord_start_chr1_cluster2; + $coord_end_chr1_cluster1 = $coord_end_chr1_cluster2; + $coord_start_chr2_cluster1 = $coord_start_chr2_cluster2; + $coord_end_chr2_cluster1 = $coord_end_chr2_cluster2; + $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')'; + $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')'; + + } else { + + $reads_left += scalar keys %{$secondTable}; + next if ($reads_left < $nb_pairs_threshold); + + if ($coord_end_chr1_cluster2 < $coord_start_chr1_cluster1) { + ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2 + + ($coord_start_chr1_cluster1,$coord_start_chr1_cluster2) = ($coord_start_chr1_cluster2,$coord_start_chr1_cluster1); + ($coord_end_chr1_cluster1,$coord_end_chr1_cluster2)=($coord_end_chr1_cluster2,$coord_end_chr1_cluster1); + ($coord_start_chr2_cluster1,$coord_start_chr2_cluster2)=($coord_start_chr2_cluster2,$coord_start_chr2_cluster1); + ($coord_end_chr2_cluster1 , $coord_end_chr2_cluster2)=($coord_end_chr2_cluster2 , $coord_end_chr2_cluster1); + + $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.'),'.$clusterCoordinates_chr1; + $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.'),'.$clusterCoordinates_chr2; + }else { + $clusterCoordinates_chr1 .= ',('.$coord_start_chr1_cluster2.','.$coord_end_chr1_cluster2.')'; + $clusterCoordinates_chr2 .= ',('.$coord_start_chr2_cluster2.','.$coord_end_chr2_cluster2.')'; + } + $coord_start_chr1 = min($coord_start_chr1_cluster1,$coord_start_chr1_cluster2); + $coord_end_chr1 = max($coord_end_chr1_cluster1,$coord_end_chr1_cluster2); + $coord_start_chr2 = min($coord_start_chr2_cluster1,$coord_start_chr2_cluster2); + $coord_end_chr2 = max($coord_end_chr2_cluster1,$coord_end_chr2_cluster2); + #to calculate breakpoints one need to take into account read orientation in claster.. + my $leftLetterOk = substr($mate_sense, 0, 1); #R + my $rightLetterOk = substr($mate_sense, 1, 1); #F + + + my @index1 = keys %{$table}; + my @index2 = keys %{$secondTable}; + + my (@generalStrand1,@generalStrand2) = 0; + + if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs + $leftLetterOk = 'R'; + $rightLetterOk = 'F'; + @generalStrand1 = translateSolidToRF(\@strand1,\@ends_order1); + @generalStrand2 = translateSolidToRF(\@strand2,\@ends_order2); + } else { + @generalStrand1 = @strand1; + @generalStrand2 = @strand2; # TODO check if it is correct + } + if ($generalStrand1[$index1[0]] eq $leftLetterOk && $generalStrand1[$index2[0]] eq $rightLetterOk) { #(R,F) + $break_pont_chr1 = '('.$coord_end_chr1_cluster1.','.$coord_start_chr1_cluster2.')'; + + if ($generalStrand2[$index1[0]] eq $rightLetterOk && $generalStrand2[$index2[0]] eq $leftLetterOk) { + if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) { + $break_pont_chr2 = '('.$coord_end_chr2_cluster2.','.$coord_start_chr2_cluster1.')'; + $signatureType = "TRANSLOC"; + } else { + $break_pont_chr2 = '('.max(($coord_end_chr2_cluster1-$maxFragmentLength),1).','.$coord_start_chr2_cluster1.')'; + $break_pont_chr2 .= ',('.$coord_end_chr2_cluster2.','.min(($coord_start_chr2_cluster2+$maxFragmentLength),$maxCoord2).')'; + $signatureType = "INS_FRAGMT"; + } + + } elsif ($generalStrand2[$index1[0]] eq $leftLetterOk && $generalStrand2[$index2[0]] eq $rightLetterOk) { + if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) { + $break_pont_chr2 = '('.max(($coord_end_chr2_cluster2-$maxFragmentLength),1).','.$coord_start_chr2_cluster2.')'; + $break_pont_chr2 .= ',('.$coord_end_chr2_cluster1.','.min(($coord_start_chr2_cluster1+$maxFragmentLength),$maxCoord2).')'; + $signatureType = "INV_INS_FRAGMT"; + } else { + $break_pont_chr2 = '('.$coord_end_chr2_cluster1.','.$coord_start_chr2_cluster2.')'; + $signatureType = "INV_TRANSLOC"; + } + } else { + #should not occur + print STDERR "\nError in orderFiltering\n\n"; + } + } + + elsif ($generalStrand1[$index1[0]] eq $rightLetterOk && $generalStrand1[$index2[0]] eq $leftLetterOk) { #(F,R) + $break_pont_chr1 = '('.max(($coord_end_chr1_cluster1-$maxFragmentLength),1).','.$coord_start_chr1_cluster1.')'; + $break_pont_chr1 .= ',('.$coord_end_chr1_cluster2.','.min(($coord_start_chr1_cluster2+$maxFragmentLength),$maxCoord1).')'; + if ($generalStrand2[$index1[0]] eq $rightLetterOk && $generalStrand2[$index2[0]] eq $leftLetterOk) { + if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) { + $break_pont_chr2 = '('.$coord_end_chr2_cluster2.','.$coord_start_chr2_cluster1.')'; + $signatureType = "INV_INS_FRAGMT"; + } else { + $break_pont_chr2 = '('.max(($coord_end_chr2_cluster1-$maxFragmentLength),1).','.$coord_start_chr2_cluster1.')'; + $break_pont_chr2 .= ',('.$coord_end_chr2_cluster2.','.min(($coord_start_chr2_cluster2+$maxFragmentLength),$maxCoord2).')'; + $signatureType = "INV_COAMPLICON"; + } + + } elsif ($generalStrand2[$index1[0]] eq $leftLetterOk && $generalStrand2[$index2[0]] eq $rightLetterOk) { + if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) { + $break_pont_chr2 = '('.max(($coord_end_chr2_cluster2-$maxFragmentLength),1).','.$coord_start_chr2_cluster2.')'; + $break_pont_chr2 .= ',('.$coord_end_chr2_cluster1.','.min(($coord_start_chr2_cluster1+$maxFragmentLength),$maxCoord2).')'; + $signatureType = "COAMPLICON"; + } else { + $break_pont_chr2 = '('.$coord_end_chr2_cluster1.','.$coord_start_chr2_cluster2.')'; + $signatureType = "INS_FRAGMT"; + } + } else { + #should not occur + $signatureType = "UNDEFINED"; + } + } + else { # (F,F) or (R,R) something strange. We will discard the smallest cluster + $ifBalanced = 'UNBAL'; + if (scalar keys %{$secondTable} > scalar keys %{$table}) { + ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2 + + $coord_start_chr1_cluster1 = $coord_start_chr1_cluster2; + $coord_end_chr1_cluster1 = $coord_end_chr1_cluster2; + $coord_start_chr2_cluster1 = $coord_start_chr2_cluster2; + $coord_end_chr2_cluster1 = $coord_end_chr2_cluster2; + $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')'; + $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')'; + } + $reads_left = scalar keys %{$table}; + } + if ($ifBalanced eq 'BAL') { + $ifRenv = $signatureType; + } + } + } + } + } + if ($ifBalanced ne 'BAL') { + #define possible break point + $coord_start_chr1 = $coord_start_chr1_cluster1; + $coord_end_chr1 = $coord_end_chr1_cluster1; + $coord_start_chr2 = $coord_start_chr2_cluster1; + $coord_end_chr2 = $coord_end_chr2_cluster1; + + my $region_length_chr1 = $coord_end_chr1-$coord_start_chr1; + my $region_length_chr2 = $coord_end_chr2-$coord_start_chr2; + + my $leftLetterOk = substr($mate_sense, 0, 1); #R + my $rightLetterOk = substr($mate_sense, 1, 1); #F + + my @index = keys %{$table}; + unless ($diff_sense_ends) { + my $firstEndOrder1 = $ends_order1[$index[0]]; + my $firstEndOrder2 = $ends_order2[$index[0]]; + $break_pont_chr1 = (($strand1[$index[0]] eq 'R' && $firstEndOrder1 == 2) || ($strand1[$index[0]] eq 'F' && $firstEndOrder1 == 1))?'('.$coord_end_chr1.','.min(($coord_start_chr1+$maxFragmentLength),$maxCoord1).')':'('.max(($coord_end_chr1-$maxFragmentLength),1).','.$coord_start_chr1.')'; + $break_pont_chr2 = (($strand2[$index[0]] eq 'R' && $firstEndOrder2 == 2) || ($strand2[$index[0]] eq 'F' && $firstEndOrder2 == 1))?'('.$coord_end_chr2.','.min(($coord_start_chr2+$maxFragmentLength),$maxCoord2).')':'('.max(($coord_end_chr2-$maxFragmentLength),1).','.$coord_start_chr2.')'; + } else { + $break_pont_chr1 = ($strand1[$index[0]] eq $leftLetterOk )?'('.$coord_end_chr1.','.min(($coord_start_chr1+$maxFragmentLength),$maxCoord1).')':'('.max(($coord_end_chr1-$maxFragmentLength),1).','.$coord_start_chr1.')'; + $break_pont_chr2 = ($strand2[$index[0]] eq $leftLetterOk )?'('.$coord_end_chr2.','.min(($coord_start_chr2+$maxFragmentLength),$maxCoord2).')':'('.max(($coord_end_chr2-$maxFragmentLength),1).','.$coord_start_chr2.')'; + } + + if ($chr1 ne $chr2){ + $ifRenv="INV_TRANSLOC" if($ifRenv eq "REVERSE_SENSE"); + $ifRenv="TRANSLOC" if($ifRenv eq "NORMAL_SENSE"); + } + } + + if (($ifBalanced eq 'BAL')&&( (scalar keys %{$table}) + (scalar keys %{$secondTable}) < $nb_pairs_threshold)) { + next; #discard the link + } + if (($ifBalanced eq 'UNBAL')&&(scalar keys %{$table} < $nb_pairs_threshold)) { + next; #discard the link + } + my $ratioTxt = "$reads_left/".(scalar @pairs); + my ($n1,$nTot) = split ("/",$strand_ratio_filtering); + my $ratioReal = $reads_left/$nTot; + + if ($coord_start_chr1<=0) { + $coord_start_chr1=1; + } + if ($coord_start_chr2<=0) { + $coord_start_chr2=1; + } + #create output + my @link=($chr->{$chr1}->{name}, $coord_start_chr1 , $coord_end_chr1, #all information output + $chr->{$chr2}->{name}, $coord_start_chr2 , $coord_end_chr2, + $reads_left, + &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@pairs), + &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@strand1), + &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@strand2), + &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@ends_order1), + &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@ends_order2), + &redraw(2,$table,$secondTable,\%badInFRSense,$ifBalanced,\@order1), + &redraw(2,$table,$secondTable,\%badInFRSense,$ifBalanced,\@order2), + &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@positions1), + &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@positions2), + $ifRenv, + $strand_ratio_filtering, + $ifBalanced, $ratioTxt, $break_pont_chr1, $break_pont_chr2, + $ratioReal, $nTot); + + $nb_links++; + print $fh2 join("\t",@link)."\n"; + + if($record>=$warn){ + print LOG "-- $fchr : $warn links analysed - $nb_links links kept\n"; + $warn+=10000; + } + + } + $fh->close; + $fh2->close; + + print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n"; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#gets information about ends positions given start, direction and order +sub getEnds { + my ($ends,$starts,$strand,$end_order,$tag_length) = @_; + for my $i (0..scalar(@{$starts})-1) { + $ends->[$i] = getEnd($starts->[$i],$strand->[$i],$end_order->[$i],$tag_length); + } +} +sub getEnd { + my ($start,$strand, $end_order,$tag_length) = @_; + return ($strand eq 'F')? $start+$tag_length->{$end_order}-1:$start-$tag_length->{$end_order}+1; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#gets starts and ends Coords when start=leftmost given positions, directions and orders +sub getCoordswithLeftMost { + + my ($starts,$ends,$positions,$strand,$end_order,$tag_length) = @_; + + for my $i (0..scalar(@{$positions})-1) { + + if($strand->[$i] eq 'F'){ + $starts->[$i]=$positions->[$i]; + $ends->[$i]=$positions->[$i]+$tag_length->{$end_order->[$i]}-1; + }else{ + $starts->[$i]=$positions->[$i]-$tag_length->{$end_order->[$i]}+1; + $ends->[$i]=$positions->[$i]; + } + } +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub addInsertionInfo { #add field with INS,DEL,NA and distance between clusters and performs filtering + + my ($chr,$chrID,$nb_pairs_threshold,$order_filtering,$indel_sigma_threshold,$dup_sigma_threshold,$singleton_sigma_threshold,$mu,$sigma,$mate_sense,$tag_length,$links_file,$flinks_file)=@_; + + my @sfile=split(/\./,$links_file); + my $fchr=$sfile[$#sfile-2]; + + + my $diff_sense_ends=(($mate_sense eq "FR") || ($mate_sense eq "RF"))? 1:0; + + my $record=0; + my $nb_links=0; + my $warn=10000; + + print LOG "\# Filtering out normal pairs using insert size...\n"; + print LOG "-- mu length=$mu, sigma length=$sigma, indel sigma threshold=$indel_sigma_threshold, dup sigma threshold=$dup_sigma_threshold\n"; + print LOG "-- using ".($mu-$indel_sigma_threshold*$sigma)."-". + ($mu+$indel_sigma_threshold*$sigma)." as normal range of insert size for indels\n"; + print LOG "-- using ".($mu-$dup_sigma_threshold*$sigma)."-". + ($mu+$dup_sigma_threshold*$sigma)." as normal range of insert size for duplications\n"; + print LOG "-- using ".($mu-$singleton_sigma_threshold*$sigma)." as the upper limit of insert size for singletons\n" if($mate_sense eq "RF"); + + my $fh = new FileHandle; + my $fh2 = new FileHandle; + + $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; + $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n"; + + while(<$fh>){ + + $record++; + my @t = split; + my ($chr1,$chr2,$mates_list)=@t[0,3,7]; + + if($chrID->{$chr1} ne $chrID->{$chr2}) { #if inter-chromosomal link here (because sv_type=all), + $nb_links++; + + $t[16]="INV_TRANSLOC" if($t[16] eq "REVERSE_SENSE"); + $t[16]="TRANSLOC" if($t[16] eq "NORMAL_SENSE"); + + $t[16].= "\t"; + $t[19].= "\t"; + + print $fh2 join("\t",@t)."\n"; + + if($record>=$warn){ + print LOG "-- $fchr : $warn links processed - $nb_links links kept\n"; + $warn+=10000; + } + next; + } + + my $ifRenv = $t[16]; + my $ifBalanced = "UNBAL"; + $ifBalanced = $t[18] if ($order_filtering); + + my $numberOfPairs = $t[6]; + my @positions1 = deleteBadOrderSensePairs(split (/,/,$t[14])); + my @positions2 = deleteBadOrderSensePairs(split (/,/,$t[15])); + + if ($ifBalanced eq "BAL") { + + if ($ifRenv eq "INV_TRANSLOC") { + $ifRenv = "INV_FRAGMT"; #for intrachromosomal inverted translocation is the same as inverted fragment + } + if ($ifRenv eq "NORMAL_SENSE") { + $ifRenv = "TRANSLOC"; + } + if ($ifRenv eq "REVERSE_SENSE") { + $ifRenv = "INV_FRAGMT"; #for intrachromosomal inverted translocation is the same as inverted fragment + } + $t[19].= "\t"; + + my $meanDistance = 0; + + for my $i (0..$numberOfPairs-1) { + $meanDistance += $positions2[$i]-$positions1[$i]; + } + $meanDistance /= $numberOfPairs; + + $t[16] = $ifRenv."\t".$meanDistance; + #dont touch the annotation. It should be already OK. + + } else { + #only for unbalanced + + my $ifoverlap=overlap($t[1],$t[2],$t[4],$t[5]); + + my $ends_sense_class = (deleteBadOrderSensePairs(split (/,/,$t[8])))[0]. + (deleteBadOrderSensePairs(split (/,/,$t[9])))[0]; + my $ends_order_class = (deleteBadOrderSensePairs(split (/,/,$t[10])))[0]. + (deleteBadOrderSensePairs(split (/,/,$t[11])))[0]; + + my $indel_type = $ifRenv; + + my $meanDistance = "N/A"; + + ($meanDistance, $indel_type) = checkIndel ($numberOfPairs, #identify insertion type for rearrangments without inversion, calculates distance between cluster + \@positions1, #assign N/A to $indel_type if unknown + \@positions2, + $ifRenv, + $ifoverlap, + $indel_sigma_threshold, + $dup_sigma_threshold, + $singleton_sigma_threshold, + $mu, + $sigma, + $ifBalanced, + $ends_sense_class, + $ends_order_class, + $mate_sense, + $diff_sense_ends, + ); + + #filtering of pairs with distance inconsistant with the SV + if ($ifRenv ne "REVERSE_SENSE") { + my $maxCoord1 =$chr->{$chrID->{$chr1}}->{length}; + my $maxCoord2 =$chr->{$chrID->{$chr2}}->{length}; + $meanDistance = recalc_t_usingInsertSizeInfo(\@t,$mu,$sigma,$meanDistance,$tag_length,$diff_sense_ends,$mate_sense, + $maxCoord1,$maxCoord2,$ends_sense_class,$ends_order_class,$nb_pairs_threshold,$order_filtering); + next if ($t[6] < $nb_pairs_threshold); + }else{ + $t[19].= "\t"; + } + $t[16] = $indel_type."\t".$meanDistance; + } + + $nb_links++; + + print $fh2 join("\t",@t)."\n"; + if($record>=$warn){ + print LOG "-- $fchr : $warn links processed - $nb_links links kept\n"; + $warn+=10000; + } + } + $fh->close; + $fh2->close; + + print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n"; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub checkIndel { + + my ($numberOfPairs, $positions1, $positions2, $ifRenv, $ifoverlap, $indel_sigma_threshold, $dup_sigma_threshold, $singleton_sigma_threshold, + $mu, $sigma, $ifBalanced,$ends_sense_class,$ends_order_class,$mate_sense,$diff_sense_ends) = @_; + + my $meanDistance = 0; + + for my $i (0..$numberOfPairs-1) { + $meanDistance += $positions2->[$i]-$positions1->[$i]; + } + $meanDistance /= $numberOfPairs; + + return ($meanDistance,"INV_DUPLI") if (($ifRenv eq "REVERSE_SENSE") && ($meanDistance<$mu+$dup_sigma_threshold*$sigma) ); + + return ($meanDistance,"INVERSION") if ($ifRenv eq "REVERSE_SENSE"); + + if($diff_sense_ends){ + return ($meanDistance, "LARGE_DUPLI") if ($ends_sense_class ne $mate_sense) && ($meanDistance>$mu+$dup_sigma_threshold*$sigma) ; + return ($meanDistance, "SINGLETON") if (($meanDistance<$mu-$singleton_sigma_threshold*$sigma) && $mate_sense eq "RF" && ($ends_sense_class eq inverseSense($mate_sense))); + }else{ + return ($meanDistance, "LARGE_DUPLI") if (($ends_sense_class eq $mate_sense) && ($ends_order_class eq "12") || ($ends_sense_class eq inverseSense($mate_sense)) && ($ends_order_class eq "21")) && + ($meanDistance>$mu+$dup_sigma_threshold*$sigma) ; + } + + return ($meanDistance, "SMALL_DUPLI") if (($meanDistance<$mu-$dup_sigma_threshold*$sigma) && $ifoverlap); + + return ($meanDistance, "DUPLICATION") if ($diff_sense_ends && ($ends_sense_class ne $mate_sense) && ($meanDistance<$mu-$dup_sigma_threshold*$sigma) ) ; + + return ($meanDistance, "INSERTION") if ($meanDistance<$mu -$indel_sigma_threshold*$sigma); + return ($meanDistance, "DELETION") if ($meanDistance>$mu+$indel_sigma_threshold*$sigma); + + return ($meanDistance, "UNDEFINED"); +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#sub reacalulate @t so that get rid of unconsistent pairs (unconsistent insert size ) +sub recalc_t_usingInsertSizeInfo { + my($t,$mu,$sigma,$meanDistance,$tag_length,$diff_sense_ends,$mate_sense,$maxCoord1,$maxCoord2,$ends_sense_class,$ends_order_class,$nb_pairs_threshold,$order_filtering) = @_; + + my @badPairs; + + my @positions1 = getAllEntries($t->[14]); + my @positions2 = getAllEntries($t->[15]); + + if ($meanDistance < $mu) { + for my $i (0..scalar(@positions1)-1) { + if (substr($positions2[$i],-1,1) ne '$' && substr($positions2[$i],-1,1) ne '*' && $positions2[$i]-$positions1[$i]>=$mu) { + push(@badPairs,$i); + } + } + } else { + for my $i (0..scalar(@positions1)-1) { + if (substr($positions2[$i],-1,1) ne '$' && substr($positions2[$i],-1,1) ne '*' && $positions2[$i]-$positions1[$i]<=$mu) { + push(@badPairs,$i); + } + } + } + + if (scalar (@badPairs)>0) { + #print join("\t",@badPairs).": ".join("\t",@t)."\n"; + #remove these inconsistant links + $t->[6] -= scalar(@badPairs); #numberOfPairs + return if ($t->[6] < $nb_pairs_threshold); + + $t->[7] = mark_values(\@badPairs, $t->[7]); + $t->[8] = mark_values(\@badPairs, $t->[8]); + $t->[9] = mark_values(\@badPairs, $t->[9]); + $t->[10] = mark_values(\@badPairs, $t->[10]); + $t->[11] = mark_values(\@badPairs, $t->[11]); + + $t->[12] = mark_indexes(\@badPairs, $t->[12]); + $t->[13] = mark_indexes(\@badPairs, $t->[13]); + + $t->[14] = mark_values(\@badPairs, $t->[14]); + $t->[15] = mark_values(\@badPairs, $t->[15]); + $t->[19] = recalculate_ratio($t->[6],$t->[19]) if ($order_filtering); #add the second ratio + $t->[17] = recalculate_ratio($t->[6],$t->[17]) unless ($order_filtering); + ($t->[1],$t->[2]) = recalculate_boundaries($t->[14],$t->[8],$t->[10],$tag_length); + ($t->[4],$t->[5]) = recalculate_boundaries($t->[15],$t->[9],$t->[11],$tag_length); + + #recalc breakpoints: + my $quant001 = 3.090232; + my $maxFragmentLength = &floor($quant001 * $sigma + $mu); + $t->[20] = recalc_breakpoints($mate_sense,$maxCoord1,$t->[14],substr($ends_sense_class,0,1),substr($ends_order_class,0,1),$t->[1],$t->[2],$maxFragmentLength,$diff_sense_ends ) if ($order_filtering); + $t->[21] = recalc_breakpoints($mate_sense,$maxCoord2,$t->[15],substr($ends_sense_class,1,1),substr($ends_order_class,1,1),$t->[4],$t->[5],$maxFragmentLength,$diff_sense_ends ) if ($order_filtering); + #recalc total ratio + $t->[22] = $t->[6] / $t->[23] if ($order_filtering); + $t->[18] = $t->[6] / $t->[19] unless ($order_filtering); + + @positions1 = deleteBadOrderSensePairs(split (/,/,$t->[14])); + @positions2 = deleteBadOrderSensePairs(split (/,/,$t->[15])); + + $meanDistance = 0; + + for my $i (0..scalar(@positions1)-1) { + $meanDistance += $positions2[$i]-$positions1[$i]; + } + $meanDistance /= scalar(@positions1); + + } else { + $t->[17] = recalculate_ratio((split(/\//,$t->[17]))[0],$t->[17]) unless ($order_filtering); + $t->[19] = recalculate_ratio((split(/\//,$t->[19]))[0],$t->[19]) if ($order_filtering); + + } #nothing has been filtered + return $meanDistance; +} + +sub recalculate_ratio { + my ($left, $ratio) = @_; + my @elements = split (/\//,$ratio); + $elements[1]= $elements[0]; + $elements[0]=$left; + return $ratio."\t".join("/",@elements); +} + +sub recalc_breakpoints { + my ($mate_sense,$maxCoord,$startString,$strand,$firstEndOrder,$coord_start_chr,$coord_end_chr,$maxFragmentLength,$diff_sense_ends ) = @_; + my $break_pont_chr; + + my $leftLetterOk = substr($mate_sense, 0, 1); #R + my $rightLetterOk = substr($mate_sense, 1, 1); #F + + + my @positions = deleteBadOrderSensePairs(split (/,/,$startString)); + + unless ($diff_sense_ends) { + $break_pont_chr = (($strand eq 'R' && $firstEndOrder == 2) || ($strand eq 'F' && $firstEndOrder == 1))?'('.$coord_end_chr.','.min(($coord_start_chr+$maxFragmentLength),$maxCoord).')':'('.max(($coord_end_chr-$maxFragmentLength),1).','.$coord_start_chr.')'; + } else { + $break_pont_chr = ($strand eq $leftLetterOk)?'('.$coord_end_chr.','.min(($coord_start_chr+$maxFragmentLength),$maxCoord).')':'('.max(($coord_end_chr-$maxFragmentLength),1).','.$coord_start_chr.')'; + } + return $break_pont_chr; +} +sub recalculate_boundaries { + my ($startString,$senseString,$endsOrderString,$tag_length) = @_; + my @positions = deleteBadOrderSensePairs(split (/,/,$startString)); + my @strands = deleteBadOrderSensePairs(split (/,/,$senseString)); + my @ends_orders = deleteBadOrderSensePairs(split (/,/,$endsOrderString)); + my @ends; getEnds(\@ends,\@positions,\@strands,\@ends_orders,$tag_length); + my $coord_start_cluster = min(min(@positions),min(@ends)); + my $coord_end_cluster = max(max(@positions),max(@ends)); + return ($coord_start_cluster,$coord_end_cluster); +} + +sub remove_indexes { + my ($bads, $string) = @_; + my @elements = deleteBadOrderSensePairs(split (/,/,$string)); + for my $i (reverse sort %{$bads}) { + delete $elements[$i]; + } + return "(".join(",",@elements).")"; +} +##add @ to to elements +sub mark_values { + my ($bads, $string) = @_; + my @elements = getAllEntries($string); + for my $i (@{$bads}) { + $elements[$i] .= "@"; + } + return "(".join(",",@elements).")"; +} +##add @ to to indexes +sub mark_indexes { + my ($bads, $string) = @_; + my @elements = getAllEntries($string); + for my $i ((0..scalar(@elements)-1)) { + for my $j (@{$bads}) { + $elements[$i] .= "@" if ($elements[$i] eq ($j+1)); + } + } + + return "(".join(",",@elements).")"; +} + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub redraw { + + my ($type,$table,$secondTable,$badInFRSense,$ifBalanced,$arr) = @_; + + my $out; + my @first_arr; + if ($ifBalanced eq 'BAL') { + my @second_arr; + my $lastPushed = 1; + if ($type == 1) { + for my $i (0 .. scalar(@{$arr})-1) { + if (exists ($table->{$i})) { + push(@first_arr,$arr->[$i]); + $lastPushed = 1; + }elsif (exists ($secondTable->{$i})) { + push(@second_arr,$arr->[$i]); + $lastPushed = 2; + } elsif ($lastPushed == 1) { + if (exists ($badInFRSense->{$i})) { + push(@first_arr,$arr->[$i]."\$"); + }else { + push(@first_arr,$arr->[$i]."*"); + } + } elsif ($lastPushed == 2) { + if (exists ($badInFRSense->{$i})) { + push(@second_arr,$arr->[$i]."\$"); + }else { + push(@second_arr,$arr->[$i]."*"); + } + } else {print "Error!";exit;} + } + } else { + for my $i (@{$arr}) { + if (exists ($table->{$i-1})) { + push(@first_arr,$i); + $lastPushed = 1; + }elsif (exists ($secondTable->{$i-1})) { + push(@second_arr,$i); + $lastPushed = 2; + } elsif ($lastPushed == 1) { + if (exists ($badInFRSense->{$i-1})) { + push(@first_arr,$i."\$"); + }else { + push(@first_arr,$i."*"); + } + } elsif ($lastPushed == 2) { + if (exists ($badInFRSense->{$i-1})) { + push(@second_arr,$i."\$"); + }else { + push(@second_arr,$i."*"); + } + } else {print "Error!";exit;} + } + } + $out = '('.join(",",@first_arr).'),('.join(",",@second_arr).')'; + } + else { + if ($type == 1) { + for my $i (0 .. scalar(@{$arr})-1) { + if (exists ($table->{$i})) { + push(@first_arr,$arr->[$i]); + } else { + if (exists ($badInFRSense->{$i})) { + push(@first_arr,$arr->[$i]."\$"); + }else { + push(@first_arr,$arr->[$i]."*"); + } + } + } + } else { + for my $i (@{$arr}) { + if (exists ($table->{$i-1})) { + push(@first_arr,$i); + } else { + if (exists ($badInFRSense->{$i-1})) { + push(@first_arr,$i."\$"); + }else { + push(@first_arr,$i."*"); + } + } + } + } + $out = '('.join(",",@first_arr).')'; + } + return $out; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub check { + + my $table = $_[0]; + my $bad = 'OK'; + my $max = 0; + for my $i (sort {$a<=>$b} keys %{$table}) { + unless ($table->{$i}->{nonAdeq} == 0) { + if ($max<$table->{$i}->{nonAdeq}) { + $max=$table->{$i}->{nonAdeq}; + $bad = $i; + } + } + } + return $bad; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub reversed { + + my ($i,$j,$ifRenv,$positions) = @_; + if (($ifRenv eq 'REVERSE_SENSE' && $positions->[$i]<$positions->[$j]) || ($ifRenv ne 'REVERSE_SENSE' && $positions->[$i]>$positions->[$j])){ + return 1; + } + return 0; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub remove { + + my ($bad,$table) = @_; + for my $i (sort {$a<=>$b} keys %{$table}) { + if ($bad == $i) { + delete($table->{$i});; + } else { + if (exists($table->{$i}->{$bad})) { + delete($table->{$i}->{$bad}); + $table->{$i}->{nonAdeq}--; + } + } + } +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub findBadInRFSenseSOLiDSolexa { #choose maximum: FFFFs or RRRRs + + my ($strand,$ends_order,$mate_sense,@keysLeft) = @_; + + my $leftLetterOk = substr($mate_sense, 0, 1); #R + my $rightLetterOk = substr($mate_sense, 1, 1); #F + + my (@standardArray); + if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs + $leftLetterOk = 'R'; + $rightLetterOk = 'F'; + @standardArray = translateSolidToRF($strand,$ends_order); + } else { + @standardArray = @{$strand}; + } + + my $ifR = 0; + my @Rs; + + for my $i (@keysLeft) { + if ($standardArray[$i] eq $leftLetterOk) { + $ifR++; + push(@Rs,$i); + } + } + + + my $ifF = 0; + my @Fs; + + for my $i (@keysLeft) { + if ($standardArray[$i] eq $rightLetterOk) { + $ifF++; + push(@Fs,$i); + } + } + + if($ifR>=$ifF) { + return @Fs; + } + return @Rs; +} + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub findBadInFRSenseSOLiDSolexa { #should work both for SOLiD and Solexa + + my ($strand1,$strand2,$ends_order1,$ends_order2,$order1,$order2) = ($_[0],$_[1],$_[2],$_[3],$_[4],$_[5]); + my $mate_sense = $_[6]; + + my $leftLetterOk = substr($mate_sense, 0, 1); #R + my $rightLetterOk = substr($mate_sense, 1, 1); #F + + my (@standardArray1,@standardArray2); + + if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs + $leftLetterOk = 'R'; + $rightLetterOk = 'F'; + @standardArray1 = translateSolidToRF($strand1,$ends_order1); + my @arr = getOrderedStrands($strand2,$order2); + my @ends2 = getOrderedStrands($ends_order2,$order2); + @standardArray2 = translateSolidToRF(\@arr,\@ends2); + + } else { + @standardArray1 = @{$strand1}; + @standardArray2 = getOrderedStrands($strand2,$order2); + } + + #we will try 4 possibilities, 2 for each end of the link: RFRR-FFF->RFFFF , RFRR-FFF->RRRFFF + + #for the first end: + + my @array = @standardArray1; + my %badInFRSense1; + for my $i (1..scalar (@array)-1){ # FRFRFFFF -> FFFFFF and RRFRFRFFFF -> RRFFFFFF + if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) { + $badInFRSense1{$i}=1; + $array[$i] = $rightLetterOk; + } + } + my $numberRRRFFF_or_FFF_1 = scalar(@array)-scalar(keys %badInFRSense1); + @array = @standardArray1; + my %badInFRSense0; + for my $i (reverse(1..scalar (@array)-1)){ # FRFRFFFFRR -> FFFFFFRR + if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) { + $badInFRSense0{$i-1}=1; + $array[$i-1] = $leftLetterOk; + + } + } + my $numberRRF1 = scalar(@array)-scalar(keys %badInFRSense0); + + #for the second end: + @array = @standardArray2; + + my %badInFRSense3; + for my $i (1..scalar(@array)-1){ + if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) { + $badInFRSense3{$order2->[$i]}=1; + $array[$i] = $rightLetterOk; + } + } + my $numberRRRFFF_or_FFF_2 = scalar(@array)-scalar(keys %badInFRSense3); + + @array = @standardArray2; + my %badInFRSense5; + for my $i (reverse(1..scalar (@array)-1)){ # FRFRFFFF -> FFFFFF + if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) { + $badInFRSense5{$i-1}=1; + $array[$i-1] = $leftLetterOk; + } + } + my $numberRRF2 = scalar(@array)-scalar(keys %badInFRSense5); + + if ($numberRRF1>=$numberRRRFFF_or_FFF_1 && $numberRRF1 >= $numberRRRFFF_or_FFF_2 && $numberRRF1 >=$numberRRF2) { + return (1,%badInFRSense0); + } + + if ($numberRRRFFF_or_FFF_1 >=$numberRRF1 && $numberRRRFFF_or_FFF_1 >= $numberRRRFFF_or_FFF_2 && $numberRRRFFF_or_FFF_1 >= $numberRRF2) { + return (1,%badInFRSense1); + } + + if ($numberRRRFFF_or_FFF_2 >= $numberRRF1 && $numberRRRFFF_or_FFF_2 >= $numberRRRFFF_or_FFF_1 && $numberRRRFFF_or_FFF_2 >=$numberRRF2) { + return (2,%badInFRSense3); + } + + if ($numberRRF2 >= $numberRRF1 && $numberRRF2 >= $numberRRRFFF_or_FFF_1 && $numberRRF2 >= $numberRRRFFF_or_FFF_2 ) { + return (2,%badInFRSense5); + } + + #should not get here: + print STDERR "Error in findBadInFRSenseSOLiDSolexa()!\n"; + return (1,%badInFRSense1); +} + +sub getOrderedStrands { + my ($strand,$order) = ($_[0],$_[1]); + my @arr; + for my $i (0..scalar(@{$strand})-1) { + push(@arr,$strand->[$order->[$i]-1]); + } + return @arr; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub checkClusters { + + my ($ifRenv,$coord_start_chr1_cluster1,$coord_start_chr1_cluster2,$coord_start_chr2_cluster1,$coord_start_chr2_cluster2) = @_; + if ($ifRenv eq 'REVERSE_SENSE') { + if ($coord_start_chr1_cluster1 <= $coord_start_chr1_cluster2) { + return ($coord_start_chr2_cluster1 <= $coord_start_chr2_cluster2)?1:0; + } + return ($coord_start_chr2_cluster1 >= $coord_start_chr2_cluster2)?1:0; + } + #if NORM + if ($coord_start_chr1_cluster1 <= $coord_start_chr1_cluster2) { + return ($coord_start_chr2_cluster1 >= $coord_start_chr2_cluster2)?1:0; + } + return ($coord_start_chr2_cluster1 <= $coord_start_chr2_cluster2)?1:0; +} + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub translateSolidToRF { + my ($strandArr,$ends_orderArr)=@_; + my @array; + for my $i (0..scalar(@{$strandArr})-1) { + if ($ends_orderArr->[$i]==1 && $strandArr->[$i] eq 'F') { + push(@array,'F'); + } + if ($ends_orderArr->[$i]==2 && $strandArr->[$i] eq 'F') { + push(@array,'R'); + } + if ($ends_orderArr->[$i]==1 && $strandArr->[$i] eq 'R') { + push(@array,'R'); + } + if ($ends_orderArr->[$i]==2 && $strandArr->[$i] eq 'R') { + push(@array,'F'); + } + } + return @array; +} + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#convert the links file to the circos format +sub links2segdup{ + + my($id,$color_code,$links_file,$segdup_file)=@_; + + print LOG "# Converting to the circos format...\n"; + + tie (my %hcolor,'Tie::IxHash'); #color-code hash table + foreach my $col (keys %{$color_code}){ + my ($min_links,$max_links)=split(",",$color_code->{$col}); + $hcolor{$col}=[$min_links,$max_links]; + } + + open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n"; + open SEGDUP, ">$segdup_file" or die "$0: can't write in the output: $segdup_file :$!\n"; + + my $index=1; + while(){ + + my ($chr1,$start1,$end1,$chr2,$start2,$end2,$count)=(split)[0,1,2,3,4,5,6]; + + my $color=getColor($count,\%hcolor,"circos"); #get the color-code according the number of links + + print SEGDUP "$index\t$id$chr1\t$start1\t$end1\tcolor=$color\n". #circos output + "$index\t$id$chr2\t$start2\t$end2\tcolor=$color\n"; + $index++; + } + + close LINKS; + close SEGDUP; + print LOG "-- output created: $segdup_file\n"; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#convert the links file to the bedPE format for BEDTools usage +sub links2bedPElinksfile{ + + my ($sample,$links_file,$bedpe_file)=@_; + + open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n"; + open BEDPE, ">$bedpe_file" or die "$0: can't write in the output: $bedpe_file :$!\n"; + + my $nb_links=1; + + while(){ + + chomp; + my @t=split("\t",$_); + my ($chr1,$start1,$end1,$chr2,$start2,$end2)=splice(@t,0,6); + my $type=($chr1 eq $chr2)? "INTRA":"INTER"; + $type.="_".$t[10]; + + $start1--; $start2--; + + print BEDPE "$chr1\t$start1\t$end1\t$chr2\t$start2\t$end2". + "\t$sample"."_link$nb_links\t$type\t.\t.". + "\t".join("|",@t)."\n"; + + $nb_links++; + } + + close LINKS; + close BEDPE; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub bedPElinks2linksfile{ + + my ($bedpe_file,$links_file)=@_; + + open BEDPE, "<$bedpe_file" or die "$0: can't open: $bedpe_file :$!\n"; + open LINKS, ">$links_file" or die "$0: can't write in the output $links_file :$!\n"; + + while(){ + + chomp; + my $sample=(split("_",(split("\t",$_))[6]))[0]; + my @t1=(split("\t",$_))[0,1,2,3,4,5]; + my @t2=split(/\|/,(split("\t",$_))[10]); + push(@t2,$sample); + + print LINKS join("\t",@t1)."\t".join("\t",@t2)."\n"; + + } + close BEDPE; + close LINKS; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#convert the links file to the bed format +sub links2bedfile{ + + my ($tag_length,$color_code,$links_file,$bed_file)=@_; + + print LOG "# Converting to the bed format...\n"; + + my $compare=1; + if($links_file!~/compared$/){ + $compare=0; + $tag_length->{none}->{1}=$tag_length->{1}; + $tag_length->{none}->{2}=$tag_length->{2}; + } + + #color-code hash table + tie (my %hcolor,'Tie::IxHash'); + my %color_order; + my $n=1; + foreach my $col (keys %{$color_code}){ + my ($min_links,$max_links)=split(",",$color_code->{$col}); + $hcolor{$col}=[$min_links,$max_links]; + $color_order{$col}=$n; + $n++; + } + + my %pair; + my %pt; + $n=1; + open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n"; + + my %str=( "F"=>"+", "R"=>"-" ); + + while(){ + + my @t=split; + my $sample=($compare)? pop(@t):"none"; + + my $chr1=$t[0]; + my $chr2=$t[3]; + $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i); + $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i); + my $same_chr=($chr1 eq $chr2)? 1:0; + + my $count=$t[6]; + my $color=getColor($count,\%hcolor,"bed"); + + my @pairs=deleteBadOrderSensePairs(split(",",$t[7])); + my @strand1=deleteBadOrderSensePairs(split(",",$t[8])); + my @strand2=deleteBadOrderSensePairs(split(",",$t[9])); + my @ends_order1=deleteBadOrderSensePairs(split(",",$t[10])); + my @ends_order2=deleteBadOrderSensePairs(split(",",$t[11])); + my @position1=deleteBadOrderSensePairs(split(",",$t[14])); + my @position2=deleteBadOrderSensePairs(split(",",$t[15])); + my @start1; my @end1; getCoordswithLeftMost(\@start1,\@end1,\@position1,\@strand1,\@ends_order1,$tag_length->{$sample}); + my @start2; my @end2; getCoordswithLeftMost(\@start2,\@end2,\@position2,\@strand2,\@ends_order2,$tag_length->{$sample}); + + + for my $p (0..$#pairs){ + + if (!exists $pair{$pairs[$p]}){ + + if($same_chr){ + + $pair{$pairs[$p]}->{0}=[ $chr1, $start1[$p]-1, $end2[$p], $pairs[$p], 0, $str{$strand1[$p]}, + $start1[$p]-1, $end2[$p], $color, + 2, $tag_length->{$sample}->{$ends_order1[$p]}.",".$tag_length->{$sample}->{$ends_order2[$p]}, "0,".($start2[$p]-$start1[$p]) ]; + $pt{$n}=$pair{$pairs[$p]}->{0}; + $n++; + + }else{ + + $pair{$pairs[$p]}->{1}=[ $chr1, $start1[$p]-1, $end1[$p] , $pairs[$p]."/1", 0, $str{$strand1[$p]}, + $start1[$p]-1, $end1[$p], $color, + 1, $tag_length->{$sample}->{$ends_order1[$p]}, 0]; + $pt{$n}=$pair{$pairs[$p]}->{1}; + $n++; + + + $pair{$pairs[$p]}->{2}=[ $chr2, $start2[$p]-1, $end2[$p], $pairs[$p]."/2", 0, $str{$strand2[$p]}, + $start2[$p]-1, $end2[$p], $color, + 1, $tag_length->{$sample}->{$ends_order2[$p]}, 0]; + $pt{$n}=$pair{$pairs[$p]}->{2}; + $n++; + } + }else{ + + if($same_chr){ + ${$pair{$pairs[$p]}->{0}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{0}}[8]}); + }else{ + ${$pair{$pairs[$p]}->{1}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{1}}[8]}); + ${$pair{$pairs[$p]}->{2}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{2}}[8]}); + } + } + } + } + close LINKS; + + my $nb_pairs=$n-1; + + open BED, ">$bed_file" or die "$0: can't write in the output: $bed_file :$!\n"; + print BED "track name=\"$bed_file\" description=\"mate pairs involved in links\" ". + "visibility=2 itemRgb=\"On\"\n"; + + for my $i (1..$nb_pairs){ + print BED join("\t",@{$pt{$i}})."\n"; + } + + close BED; + + print LOG "-- output created: $bed_file\n"; + + undef %pair; + undef %pt; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub deleteBadOrderSensePairs{ + + my (@tab)=@_; + my @tab2; + + foreach my $v (@tab){ + + $v=~s/[\(\)]//g; + push(@tab2,$v) if($v!~/[\$\*\@]$/); + } + return @tab2; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub getAllEntries{ + my (@tab)=split (/,/,$_[0]); + my @tab2; + + foreach my $v (@tab){ + + $v=~s/[\(\)]//g; + push(@tab2,$v); + } + return @tab2; +}#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub getAllEntriesWOspecialChar{ + my (@tab)=split (/,/,$_[0]); + my @tab2; + + foreach my $v (@tab){ + + $v=~s/[\(\)\$\*\@]//g; + push(@tab2,$v); + } + return @tab2; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub links2SVfile{ + + my($links_file,$sv_file)=@_; + + print LOG "# Converting to the sv output table...\n"; + open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n"; + open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n"; + + my @header=qw(chr_type SV_type BAL_type chromosome1 start1-end1 average_dist + chromosome2 start2-end2 nb_pairs score_strand_filtering score_order_filtering score_insert_size_filtering + final_score breakpoint1_start1-end1 breakpoint2_start2-end2); + + my $nb_links=0; + + while (){ + + my @t=split; + my @sv=(); + my $sv_type="-"; + my $strand_ratio="-"; + my $eq_ratio="-"; + my $eq_type="-"; + my $insert_ratio="-"; + my $link="-"; + my ($bk1, $bk2)=("-","-"); + my $score="-"; + + my ($chr1,$start1,$end1)=($t[0],$t[1],$t[2]); + my ($chr2,$start2,$end2)=($t[3],$t[4],$t[5]); + my $nb_pairs=$t[6]; + $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i); + $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i); + my $chr_type=($chr1 eq $chr2)? "INTRA":"INTER"; + + #if strand filtering + if (defined $t[16]){ + #if inter-chr link + $sv_type=$t[16]; + if(defined $t[17] && $t[17]=~/^(\d+)\/(\d+)$/){ + $strand_ratio=floor($1/$2*100)."%"; + $score=$t[18]; + } + if(defined $t[18] && $t[18]=~/^(\d+)\/(\d+)$/){ + #if intra-chr link with insert size filtering + $strand_ratio=floor($1/$2*100)."%"; + $link=floor($t[17]); + if($sv_type!~/^INV/){ + $insert_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/); + $score=$t[20]; + }else{ + $score=$t[19]; + } + } + } + + if(defined $t[18] && ($t[18] eq "UNBAL" || $t[18] eq "BAL")){ + + #if strand and order filtering only and/or interchr link + $eq_type=$t[18]; + $eq_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/); + ($bk1, $bk2)=($t[20],$t[21]); + foreach my $bk ($bk1, $bk2){$bk=~s/\),\(/ /g; $bk=~s/(\(|\))//g; $bk=~s/,/-/g;} + $score=$t[22]; + + }elsif(defined $t[19] && ($t[19] eq "UNBAL" || $t[19] eq "BAL")){ + + #if all three filtering + $link=floor($t[17]); + $eq_type=$t[19]; + $eq_ratio=floor($1/$2*100)."%" if($t[20]=~/^(\d+)\/(\d+)$/); + + if(defined $t[21] && $t[21]=~/^(\d+)\/(\d+)$/){ + $insert_ratio=floor($1/$2*100)."%"; + ($bk1, $bk2)=($t[22],$t[23]); + $score=$t[24]; + + }else{ + ($bk1, $bk2)=($t[21],$t[22]); + $score=$t[23]; + } + foreach my $bk ($bk1, $bk2){$bk=~s/\),\(/ /g; $bk=~s/(\(|\))//g; $bk=~s/,/-/g;} + + } + + + push(@sv, $chr_type, $sv_type,$eq_type); + push(@sv,"$chr1\t$start1-$end1"); + push(@sv, $link); + push(@sv,"$chr2\t$start2-$end2", + $nb_pairs,$strand_ratio,$eq_ratio,$insert_ratio, decimal($score,4), $bk1, $bk2); + + + print SV join("\t",@sv)."\n"; + } + + close LINKS; + close SV; + + system "sort -k 9,9nr -k 13,13nr $sv_file > $sv_file.sorted"; + + open SV, "<".$sv_file.".sorted" or die "$0: can't open in the output: $sv_file".".sorted :$!\n"; + my @links=; + close SV; + + open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n"; + + print SV join("\t",@header)."\n"; + print SV @links; + close SV; + + unlink($sv_file.".sorted"); + + print LOG "-- output created: $sv_file\n"; + +} +#------------------------------------------------------------------------------# +sub densityCalculation{ + + my ($chr,$chrID,$file,$tag_length,$window_dist,$step,$mates_file,$mates_file_ref,$density_file,$input_format)=@_; + + my @sfile=split(/\./,$$mates_file[$file]); + my $fchr=$sfile[$#sfile]; + + my $fh = new FileHandle; + + my %density; + my %density_ref; + my @ratio; + my ($cov,$cov_ref); + + #FREQUENCY CALCULATION PROCEDURE + print LOG "# $fchr : Frequency calculation procedure...\n"; + &FreqCalculation(\%density,$chr,$chrID,$tag_length,$window_dist,$step,$$mates_file[$file],$input_format); + &FreqCalculation(\%density_ref,$chr,$chrID,$tag_length,$window_dist,$step,$$mates_file_ref[$file],$input_format); + + #MAKING RATIO AND OUTPUT + print LOG "\# Ratio calculation procedure...\n"; + $density_file=~s/\/mates\//\/density\//; + $fh->open(">".$density_file) or die "$0: can't write in the output ".$density_file." :$!\n"; + + foreach my $k (1..$chr->{nb_chrs}){ + foreach my $frag (1..$chr->{$k}->{nb_frag}){ + + @ratio= ($chr->{$k}->{name}, + (${$chr->{$k}->{$frag}}[0]+1), + (${$chr->{$k}->{$frag}}[1]+1)); + + $cov=(exists $density{$k}{$frag}->{count})? $density{$k}{$frag}->{count}:0; + $cov_ref=(exists $density_ref{$k}{$frag}->{count})? $density_ref{$k}{$frag}->{count}:0; + + push(@ratio,$cov,$cov_ref); + push(@ratio,log($cov/$cov_ref)) if($cov && $cov_ref); + push(@ratio,-log($cov_ref+1)) if(!$cov && $cov_ref); + push(@ratio,log($cov+1)) if($cov && !$cov_ref); + next if(!$cov && !$cov_ref); + + print $fh join("\t",@ratio)."\n"; + } + } + + $fh->close; + print LOG "-- output created: $density_file\n"; + + undef %density; + undef %density_ref; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub FreqCalculation{ + + my ($density,$chr,$chrID,$tag_length,$window_dist,$step,$mates_file,$input_format) = @_; + + my @sfile=split(/\./,$mates_file); + my $fchr=$sfile[$#sfile]; + my $fh = new FileHandle; + + my $nb_windows=0; + my $warn=100000; + my $record=0; + my %pair; + + my ($sumX,$sumX2) = (0,0); + + print LOG "\# Frequency calculation for $mates_file...\n"; + + if ($mates_file =~ /.gz$/) { + $fh->open("gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; + }elsif($mates_file =~ /.bam$/){ + o$fh->open("$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY + }else{ + $fh->open("<".$mates_file) or die "$0: can't open ".$mates_file.":$!\n"; + } + + while(<$fh>){ + + my @t=split; + my $mate=$t[0]; + + my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1, $end_order_read2,); + + next if(exists $pair{$mate}); + + next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2,\$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length)); + + next unless (exists $chrID->{$chr_read1} || exists $chrID->{$chr_read2}); + ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2}); + + $pair{$mate}=undef; + $record++; + + my ($coord_start_read1,$coord_end_read1, $coord_start_read2,$coord_end_read2); + + recupCoords($firstbase_read1,\$coord_start_read1,\$coord_end_read1,$tag_length->{$end_order_read1},$input_format); + recupCoords($firstbase_read2,\$coord_start_read2,\$coord_end_read2,$tag_length->{$end_order_read2},$input_format); + + my $length = abs($coord_start_read1-$coord_start_read2); + $sumX += $length; #add to sum and sum^2 for mean and variance calculation + $sumX2 += $length*$length; + + for(my $i=1;$i<=$chr->{$chr_read1}->{'nb_frag'};$i++){ + + if (abs ($coord_start_read1-${$chr->{$chr_read1}->{$i}}[0]) <= $window_dist){ + + &addToDensity($density,$chr_read1,$i,\$nb_windows) + if(overlap($coord_start_read1,$coord_end_read2,${$chr->{$chr_read1}->{$i}}[0],${$chr->{$chr_read1}->{$i}}[1])); + + }else{ + + $i=getNextFrag($coord_start_read1,$i,${$chr->{$chr_read1}->{$i}}[0],$chr->{$chr_read1}->{nb_frag},$window_dist,$step); + } + } + + if($record>=$warn){ + print LOG "-- $warn mate-pairs analysed - $nb_windows points created\n"; + $warn+=100000; + } + } + $fh->close; + + print LOG "-- $fchr : Total : $record mate-pairs analysed - $nb_windows points created\n"; + + if($record>0){ + + my $mu = $sumX/$record; + my $sigma = sqrt($sumX2/$record - $mu*$mu); + print LOG "-- $fchr : mu length = $mu, sigma length = $sigma\n"; + } + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub ratio2segdup{ + + my($id,$density_file,$segdup_file)=@_; + + print LOG "# Converting to circos format...\n"; + + open RATIO, "<$density_file" or die "$0: can't open $density_file :$!\n"; + open SEGDUP, ">$segdup_file" or die "$0: can't write in the output: $segdup_file :$!\n"; + + while(){ + chomp; + my ($chr1,$start1,$end1,$ratio)=(split /\t/)[0,1,2,5]; + print SEGDUP "$id$chr1\t$start1\t$end1\t$ratio\n"; + } + + close RATIO; + close SEGDUP; + print LOG "-- output created: $segdup_file\n"; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub ratio2bedfile{ + + my($density_file,$bed_file)=@_; + + print LOG "# Converting to bedGraph format...\n"; + + open RATIO, "<$density_file" or die "$0: can't open $density_file :$!\n"; + open BED, ">$bed_file" or die "$0: can't write in the output: $bed_file :$!\n"; + print BED "track type=bedGraph name=\"$bed_file\" description=\"log ratios for cnv detection\" ". + "visibility=2 color=255,0,0 alwaysZero=\"On\"\n"; + + while(){ + chomp; + my ($chr1,$start1,$end1,$ratio)=(split /\t/)[0,1,2,5]; + $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/); + print BED "$chr1\t".($start1-1)."\t$end1\t$ratio\n"; + } + + close RATIO; + close BED; + print LOG "-- output created: $bed_file\n"; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub inverseSense{ + + my $mate_sense=$_[0]; + my %reverse=( 'F' => 'R' , 'R' => 'F' , + 'FF' => 'RR', 'RR' => 'FF', + 'FR' => 'RF', 'RF' => 'FR'); + return $reverse{$mate_sense}; +} + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub getNextFrag{ + + my ($read_start,$frag_num,$frag_start,$frag_last,$window_dist,$step)=@_; + + my $how_far = $read_start-$frag_start; + my $nb_windows_toskip; + + if($how_far>0){ + + $nb_windows_toskip=($how_far/$step)-($window_dist/$step); + $nb_windows_toskip=~ s/\..*//; + $nb_windows_toskip=0 if($nb_windows_toskip<0); + return ($frag_num + $nb_windows_toskip); + } + return $frag_last; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub getColor{ + + my($count,$hcolor,$format)=@_; + for my $col ( keys % { $hcolor} ) { + return $col if($count>=$hcolor->{$col}->[0] && $count<=$hcolor->{$col}->[1]); + } + return "white" if($format eq "circos"); + return "255,255,255" if($format eq "bed"); +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub recupCoords{ + + my($c_hit,$cs_hit,$ce_hit,$tag_length,$input_format)=@_; + my $strand = 'F'; + + if ($c_hit=~s/^\-//) { + $strand='R'; + $$cs_hit=$c_hit; + $$ce_hit=$c_hit-($tag_length-1); + }else{ + $$cs_hit=$c_hit; + $$ce_hit=$c_hit+($tag_length-1); + } + return $strand; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub overlap { + my($cs_hit,$ce_hit,$cs_region,$ce_region)=@_; + if( (($cs_hit < $cs_region) && ($ce_hit < $cs_region )) || (($cs_hit > $ce_region) && ($ce_hit > $ce_region )) ) { + return 0; + } + return 1; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub makeLink { + + my ($link,$chr1,$frag1,$chr2,$frag2,$mt,$nb)=@_; + + if($chr1>$chr2){ + ($chr1,$chr2)= ($chr2,$chr1); + ($frag1,$frag2)= ($frag2,$frag1); + } + + if($chr1 == $chr2){ + if($frag1>$frag2){ + ($frag1,$frag2)= ($frag2,$frag1); + } + } + + if(!exists $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}){ + $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}=$mt; + $$nb++; + }elsif($link->{$chr1}->{$chr2}->{$frag1}->{$frag2}!~/(^|,)$mt(,|$)/){ + $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}.=",$mt"; + } +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#fonction of adding the read to the density profile +sub addToDensity { + + my ($density,$chr1,$frag1,$nb)=@_; + + if(!exists $density->{$chr1}->{$frag1}->{count}){ + $density->{$chr1}->{$frag1}->{count}=1; + $$nb++; + }else{ + $density->{$chr1}->{$frag1}->{count}++; + } +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub floor { + my $nb = $_[0]; + $nb=~ s/\..*//; + return $nb; +} +#------------------------------------------------------------------------------# +sub decimal{ + + my $num=shift; + my $digs_to_cut=shift; + + $num=sprintf("%.".($digs_to_cut-1)."f", $num) if ($num=~/\d+\.(\d){$digs_to_cut,}/); + + return $num; +} + +#------------------------------------------------------------------------------# +sub max { + + my($max) = shift(@_); + foreach my $temp (@_) { + $max = $temp if $temp > $max; + } + return($max); +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub min { + + my($min) = shift(@_); + foreach my $temp (@_) { + $min = $temp if $temp < $min; + } + return($min); +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub sortTablebyIndex{ + my ($tab1,$tab2)=@_; + my @tab3; + + foreach my $i (@$tab1){ + $tab3[$i]=$$tab2[$$tab1[$i]]; + } + return @tab3; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub round { + my $number = shift || 0; + my $dec = 10 ** (shift || 0); + return int( $dec * $number + .5 * ($number <=> 0)) / $dec; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub getUniqueTable{ + + my (@tab)=@_; + my (%saw,@out)=(); + undef %saw; + return sort(grep(!$saw{$_}++, @tab)); +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub catFiles { + + unlink("$_[1]") if(exists $_[1]); + system qq( cat "$_" >> "$_[1]" ) for @{$_[0]}; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#check if the configuration file is correct +sub validateconfiguration{ + + my %conf=%{$_[0]}; + my $list_prgs="@ARGV"; + + my @general_params=qw(input_format mates_orientation read1_length read2_length mates_file cmap_file); + my @detection_params=qw(split_mate_file window_size step_length split_mate_file); + my @filtering_params=qw(split_link_file nb_pairs_threshold strand_filtering split_link_file); + my @circos_params=qw(organism_id colorcode); + my @bed_params=qw(colorcode); + my @compare_params=qw(list_samples file_suffix); + + foreach my $dir ($conf{general}{output_dir},$conf{general}{tmp_dir}){ + + unless (defined($dir)) { + $dir = "."; + } + unless (-d $dir){ + mkdir $dir or die; + } + $dir.="/" if($dir!~/\/$/); + } + + unless (defined($conf{general}{num_threads})) { + $conf{general}{num_threads} = 1; + } + $conf{general}{num_threads}=24 if($conf{general}{num_threads}>24); + + if($list_prgs!~/links2compare/){ + + foreach my $p (@general_params){ + die("Error Config : The parameter \"$p\" is not defined\n") if (!defined $conf{general}{$p}); + } + + $conf{general}{input_format}="sam" if($conf{general}{input_format} eq "bam"); + + unless (defined($conf{general}{sv_type})) { + $conf{general}{sv_type} = "all"; + } + + $conf{general}{read_lengths}={ 1=> $conf{general}{read1_length}, 2=> $conf{general}{read2_length}}; + } + + if($list_prgs=~/(linking|cnv)/){ + + foreach my $p (@detection_params){ + die("Error Config : The parameter \"$p\" is not defined\n") if (!defined $conf{detection}{$p}); + } + + die("Error Config : The parameter \"mates_file_ref\" is not defined\n") if($list_prgs=~/cnv/ && !defined $conf{detection}{mates_file_ref}); + + if($conf{detection}{step_length}>$conf{detection}{window_size}){ + die("Error Config : Parameter \"step_length\" should not exceed \"window size\"\n"); + } + + unless (-d $conf{general}{tmp_dir}."/mates"){ + mkdir $conf{general}{tmp_dir}."/mates" or die; + } + + if($list_prgs=~/linking/){ + unless (-d $conf{general}{tmp_dir}."/links"){ + mkdir $conf{general}{tmp_dir}."/links" or die; + } + } + if($list_prgs=~/cnv/){ + unless (-d $conf{general}{tmp_dir}."/density"){ + mkdir $conf{general}{tmp_dir}."/density" or die; + } + } + + } + + if($list_prgs=~/filtering/){ + + foreach my $p (@filtering_params) { + die("Error Config : The filtering parameter \"$p\" is not defined\n") if (!defined $conf{filtering}{$p}); + + } + + if(defined($conf{filtering}{chromosomes})) { + my @chrs=split(",",$conf{filtering}{chromosomes}); + my $exclude=($chrs[0]=~/^\-/)? 1:0; + for my $chrName (@chrs){ + + die("Error Config : The filtering parameter \"chromosomes\" is not valid\n") + if(($chrName!~/^\-/ && $exclude) || ($chrName=~/^\-/ && !$exclude)); + + } + } + + if (( $conf{filtering}{order_filtering} )&& !$conf{filtering}{strand_filtering}) { + die("Error Config : The parameter strand_filtering is set to \"0\" while order_filtering is selected". + "\nChange strand_filtering to \"1\" if you want to use the order filtering\n"); + } + if (( !defined($conf{filtering}{mu_length}) || !defined($conf{filtering}{sigma_length}) )&& $conf{filtering}{order_filtering}) { + die("Error Config : You should set parameters \"mu_length\" and \"sigma_length\" to use order filtering\n"); + } + if (( $conf{filtering}{insert_size_filtering} )&& !$conf{filtering}{strand_filtering}) { + die("Error Config : The parameter strand_filtering is set to \"0\" while insert_size_filtering is selected". + "\nChange strand_filtering to \"1\" if you want to use the insert size filtering\n"); + } + if (( !defined($conf{filtering}{mu_length}) || !defined($conf{filtering}{sigma_length}) )&& $conf{filtering}{insert_size_filtering}) { + die("Error Config : You should set parameters \"mu_length\" and \"sigma_length\" to use discriminate insertions from deletions\n"); + } + + if (!defined($conf{filtering}{indel_sigma_threshold})) { + $conf{filtering}{indel_sigma_threshold} = 2; + } + if (!defined($conf{filtering}{dup_sigma_threshold})) { + $conf{filtering}{dup_sigma_threshold} = 2; + } + if (!defined($conf{filtering}{singleton_sigma_threshold})) { + $conf{filtering}{singleton_sigma_threshold} = 4; + } + + if (!defined($conf{filtering}{nb_pairs_order_threshold})) { + $conf{filtering}{nb_pairs_order_threshold} = 1; + } + + if (!defined($conf{filtering}{final_score_threshold})) { + $conf{filtering}{final_score_threshold} = 0.8; + } + + if ($conf{filtering}{nb_pairs_order_threshold}>$conf{filtering}{nb_pairs_threshold}) { + die("Error Config : Parameter \"nb_pairs_order_threshold\" should not exceed \"nb_pairs_threshold\"\n"); + } + + } + + if($list_prgs=~/2circos$/){ + foreach my $p (@circos_params) { + next if($list_prgs=~/^ratio/ && $p eq "colorcode"); + die("Error Config : The circos parameter \"$p\" is not defined\n") if (!defined $conf{circos}{$p}); + } + } + + if($list_prgs=~/2bed$/){ + foreach my $p (@bed_params) { + die("Error Config : The bed parameter \"$p\" is not defined\n") if (!defined $conf{bed}{$p}); + } + } + + if($list_prgs=~/links2compare/){ + foreach my $p (@compare_params) { + die("Error Config : The compare parameter \"$p\" is not defined\n") if (!defined $conf{compare}{$p}); + } + + unless (defined($conf{compare}{same_sv_type})) { + $conf{compare}{same_sv_type} = 0; + } + + unless (defined($conf{compare}{min_overlap})) { + $conf{compare}{min_overlap} = 1E-9; + } + + if($conf{compare}{circos_output}){ + foreach my $p (@circos_params) { + next if($list_prgs=~/^ratio/ && $p eq "colorcode"); + die("Error Config : The circos parameter \"$p\" is not defined\n") if (!defined $conf{circos}{$p}); + } + } + if($conf{compare}{bed_output}){ + foreach my $p (@bed_params) { + die("Error Config : The bed parameter \"$p\" is not defined\n") if (!defined $conf{bed}{$p}); + } + die("Error Config : The compare parameter \"list_read_lengths\" is not defined\n") if (!defined $conf{compare}{list_read_lengths}); + + my @samples=split(",",$conf{compare}{list_samples}); + my @read_lengths=split(",",$conf{compare}{list_read_lengths}); + for my $i (0..$#samples){ + my @l=split("-",$read_lengths[$i]); + $conf{compare}{read_lengths}{$samples[$i]}={ 1=> $l[0], 2=> $l[1]}; + } + } + } + + +} +#------------------------------------------------------------------------------# +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# diff -r c284618dd8da -r 091714bd75a0 SVDetect_r0.8b_galaxy/svdetect/SVDetect_run_parallel.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SVDetect_r0.8b_galaxy/svdetect/SVDetect_run_parallel.xml Tue Jan 22 06:20:22 2013 -0500 @@ -0,0 +1,324 @@ + + +and identify structural variants + +SVDetect_run_parallel.pl + +#if $getLinks.linking == "linking" +linking + +#end if +#if $getFilteredLinks.filtering == "filtering" +filtering + +#if str($getFilteredLinks.links2SV) == "create" +links2SV +-out3 '$sv_file' +#end if +#if $getFilteredLinks.file_conversion.file_conversion_select=="convert" and str($getFilteredLinks.file_conversion.links2circos) == "create" +links2circos +-out4 '$circos_file' +#end if +#if $getFilteredLinks.file_conversion.file_conversion_select=="convert" and str($getFilteredLinks.file_conversion.links2bed) == "create" +links2bed +-out5 '$bed_file' +#end if +#end if +-conf '$config_file' +-l '$log_file' +-N '$sample_name' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ( + getFilteredLinks['filtering']=="filtering" and + getFilteredLinks['links2SV'] is True + ) + + + + ( + getFilteredLinks['filtering']=="filtering" and + getFilteredLinks['file_conversion']['file_conversion_select']=="convert" and + getFilteredLinks['file_conversion']['links2circos'] is True + ) + + + + ( + getFilteredLinks['filtering']=="filtering" and + getFilteredLinks['file_conversion']['file_conversion_select']=="convert" and + getFilteredLinks['file_conversion']['links2bed'] is True + ) + + + + + + + + + +<general> +input_format = bam +sv_type = ${sv_type} +mates_orientation=${mates_orientation} +read1_length=${read1_length} +read2_length=${read2_length} +mates_file=${mates_file} +cmap_file=${cmap_file} +tmp_dir=$__new_file_path__/svdetect/tmp +output_dir=$__new_file_path__/svdetect +num_threads=8 +</general> + +#if $getLinks.linking == "linking" +<detection> +#if str($getLinks.splitmate) == "split" +split_mate_file=1 +#else +split_mate_file=0 +#end if +window_size=${getLinks.window_size} +step_length=${getLinks.step_length} +</detection> +#end if + +#if $getFilteredLinks.filtering == "filtering" +<filtering> +#if str($getFilteredLinks.splitlink) == "split" +split_link_file=1 +#else +split_link_file=0 +#end if +#if str($getFilteredLinks.chromosomes) != "" +chromosomes=${getFilteredLinks.chromosomes} +#end if +nb_pairs_threshold=${getFilteredLinks.nb_pairs_threshold} +#if $getFilteredLinks.filter1.strand_filtering == "strand" +strand_filtering=1 +final_score_threshold=${getFilteredLinks.filter1.final_score_threshold} +#if $getFilteredLinks.filter1.filter2.order_filtering == "order" +order_filtering=1 +mu_length=${getFilteredLinks.filter1.filter2.mu_length} +sigma_length=${getFilteredLinks.filter1.filter2.sigma_length} +nb_pairs_order_threshold=${getFilteredLinks.filter1.filter2.nb_pairs_order_threshold} +#if $getFilteredLinks.filter1.filter2.filter3.insert_size_filtering == "insert" +insert_size_filtering=1 +indel_sigma_threshold=${getFilteredLinks.filter1.filter2.filter3.indel_sigma_threshold} +dup_sigma_threshold=${getFilteredLinks.filter1.filter2.filter3.dup_sigma_threshold} +singleton_sigma_threshold=${getFilteredLinks.filter1.filter2.filter3.singleton_sigma_threshold} +#else +insert_size_filtering=0 +#end if +#else +order_filtering=0 +#end if +#else +strand_filtering=0 +#end if +</filtering> +#end if + +#if $getFilteredLinks.filtering == "filtering" +#if $getFilteredLinks.file_conversion.file_conversion_select == "convert" +#if str($getFilteredLinks.file_conversion.links2circos) == "create" +<circos> +organism_id=${getFilteredLinks.file_conversion.organism_id} +<colorcode> +#for $color_repeat in $getFilteredLinks.file_conversion.color_code +${color_repeat.color}=${color_repeat.interval} +#end for +</colorcode> +</circos> +#end if +#if str($getFilteredLinks.file_conversion.links2bed) == "create" +<bed> +<colorcode> +#for $color_repeat in $getFilteredLinks.file_conversion.color_code +#if str($color_repeat.color)== "grey" +190,190,190=${color_repeat.interval} +#end if +#if str($color_repeat.color)== "black" +0,0,0=${color_repeat.interval} +#end if +#if str($color_repeat.color)== "blue" +0,0,255=${color_repeat.interval} +#end if +#if str($color_repeat.color)== "green" +0,255,0=${color_repeat.interval} +#end if +#if str($color_repeat.color)== "purple" +153,50,205=${color_repeat.interval} +#end if +#if str($color_repeat.color)== "orange" +255,140,0=${color_repeat.interval} +#end if +#if str($color_repeat.color)== "red" +255,0,0=${color_repeat.interval} +#end if +#end for +</colorcode> +</bed> +#end if +#end if +#end if + + + + +**What it does** + +SVDetect - Version : 0.8b + +Parallel version (nCPU=8) + +SVDetect is a application for the isolation and the type prediction of intra- and inter-chromosomal rearrangements from paired-end/mate-pair sequencing data provided by the high-throughput sequencing technologies + +This tool aims to identifying structural variations (SVs) with both clustering and sliding-window strategies, and helping in their visualization at the genome scale. +SVDetect is compatible with SOLiD and Illumina (>=1.3) reads. + +Manual documentation available at the http://svdetect.sourceforge.net/Site/Manual.html + +----- + +.. class:: infomark + +Contact Bruno Zeitouni (svdetect@curie.fr) for any questions or concerns about the Galaxy implementation of SVDetect. + + + + diff -r c284618dd8da -r 091714bd75a0 SVDetect_r0.8b_galaxy/svdetect/circos_graph.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SVDetect_r0.8b_galaxy/svdetect/circos_graph.xml Tue Jan 22 06:20:22 2013 -0500 @@ -0,0 +1,290 @@ + + +plots + +circos/bin/circos + +-conf '$circos_config_file' +-outputfile '${outputfile}.dat' +-png + +> '$log_file' + +; + +rm '$outputfile'; ln -s '${outputfile}.png' '$outputfile' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +<ideogram> + +<spacing> + +default = 5u +break = 1u + +axis_break_at_edge = yes +axis_break = yes +axis_break_style = 2 + +<break_style 1> +stroke_color = black +fill_color = blue +thickness = 0.25r +stroke_thickness = 2 +</break> + +<break_style 2> +stroke_color = black +stroke_thickness = 3p +thickness = 1.5r +</break> + +</spacing> + +## thickness (px) of chromosome ideogram +thickness = 100p +stroke_thickness = 2 +## ideogram border color +stroke_color = black +fill = yes +## the default chromosome color is set here and any value +## defined in the karyotype file overrides it +fill_color = black + +## fractional radius position of chromosome ideogram within image +radius = 0.85r +show_label = yes +label_with_tag = yes +label_font = condensedbold +label_radius = dims(ideogram,radius) + 0.075r +label_size = 60p + +## cytogenetic bands +band_stroke_thickness = 2 + +## show_bands determines whether the outline of cytogenetic bands +## will be seen +show_bands = yes +## in order to fill the bands with the color defined in the karyotype +## file you must set fill_bands +fill_bands = yes + +</ideogram> + + + + + +show_ticks = yes +show_tick_labels = yes + +<ticks> +radius = dims(ideogram,radius_outer) +multiplier = 1e-6 + +<tick> +spacing = 0.5u +size = 2p +thickness = 2p +color = grey +show_label = no +label_size = 12p +label_offset = 0p +format = %.2f +</tick> + +<tick> +spacing = 1u +size = 3p +thickness = 2p +color = dgrey +show_label = no +label_size = 12p +label_offset = 0p +format = %.2f +</tick> + +<tick> +spacing = 5u +size = 5p +thickness = 2p +color = black +show_label = yes +label_size = 16p +label_offset = 0p +format = %d +</tick> + +<tick> +spacing = 10u +size = 8p +thickness = 2p +color = black +show_label = yes +label_size = 20p +label_offset = 5p +format = %d +</tick> +</ticks> + + + + +<colors> +<<include etc/colors.conf>> +</colors> + +<fonts> +<<include etc/fonts.conf>> +</fonts> + +<<include $ideogram_config_file>> +<<include $ticks_config_file>> + +karyotype = $karyotype + +<image> +24bit = yes +##png = yes +##svg = no +## radius of inscribed circle in image +radius = 1500p +background = white +## by default angle=0 is at 3 o'clock position +angle_offset = -90 +#angle_orientation = counterclockwise + +auto_alpha_colors = yes +auto_alpha_steps = 5 +</image> + +chromosomes_units= $chromosomes_units + +#if str($chromosomes)=="" +chromosomes_display_default = yes +#else +chromosomes_display_default = no +chromosomes = $chromosomes +#end if + +<links> + +z = 0 +radius = 0.95r +bezier_radius = 0.2r + +<link segdup> +show = yes +color = dgrey_a5 +thickness = 2 +file = $link_file +record_limit = 1000 +</link> + +</links> + + +anglestep = 0.5 +minslicestep = 10 +beziersamples = 40 +debug = no +warnings = no +imagemap = no + +units_ok = bupr +units_nounit = n + + + + +**What it does** + +Circos + +Manual documentation available at the http://circos.ca/ + + +**Example of link segdup file** + +segdup file:: + + 1 hs1 1077096 1078746 color=red + 1 hs1 1080923 1082805 color=red + 2 hs1 1137684 1137961 color=red + 2 hs3 1138138 1138423 color=red + 3 hs11 1169417 1170000 color=red + 3 hs11 1170025 1170975 color=red + 4 hs11 1222480 1224271 color=green + 4 hs11 1223328 1225675 color=green + 5 hs12 1223336 1225812 color=grey + 5 hs13 1224709 1227633 color=grey + 6 hs11 1223621 1226460 color=red + 6 hs11 1224918 1227633 color=red + 7 hs11 1399510 1401513 color=white + 7 hs11 1401628 1403697 color=white + 8 hs15 1652045 1653746 color=red + 8 hs15 1657167 1658940 color=red + 9 hs11 165333 165887 color=white + 9 hs11 165981 168016 color=white + 10 hs11 1702700 1702841 color=red + 10 hs11 1702903 1703057 color=red + 11 hs11 1912272 1915186 color=white + 11 hs11 1937111 1939824 color=white + 12 hs11 1983211 1983355 color=red + 12 hs11 1983591 1983748 color=red + 13 hs11 2913657 2913898 color=white + 13 hs11 2914048 2914341 color=white + 14 hs11 3090593 3090749 color=purple + 14 hs11 3090709 3090864 color=purple + 15 hs21 3466365 3466434 color=red + 15 hs21 3466554 3466620 color=red + 16 hsX 3603073 3603321 color=white + 16 hsX 3603295 3603520 color=white + + + +----- + +.. class:: infomark + +Contact Bruno Zeitouni (svdetect@curie.fr) for any questions or concerns about the Galaxy implementation of Circos. + + + + + diff -r c284618dd8da -r 091714bd75a0 external/karyotype.human_hg19.txt --- a/external/karyotype.human_hg19.txt Tue Nov 06 10:06:26 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,888 +0,0 @@ -# Human chromsome sizes and cytogenetic band position. -# Data based on hg19 assembly. -chr - hs1 1 0 249250621 green -chr - hs2 2 0 243199373 green -chr - hs3 3 0 198022430 green -chr - hs4 4 0 191154276 green -chr - hs5 5 0 180915260 green -chr - hs6 6 0 171115067 green -chr - hs7 7 0 159138663 green -chr - hs8 8 0 146364022 green -chr - hs9 9 0 141213431 green -chr - hs10 10 0 135534747 green -chr - hs11 11 0 135006516 green -chr - hs12 12 0 133851895 green -chr - hs13 13 0 115169878 green -chr - hs14 14 0 107349540 green -chr - hs15 15 0 102531392 green -chr - hs16 16 0 90354753 green -chr - hs17 17 0 81195210 green -chr - hs18 18 0 78077248 green -chr - hs19 19 0 59128983 green -chr - hs20 20 0 63025520 green -chr - hs21 21 0 48129895 green -chr - hs22 22 0 51304566 green -chr - hsX X 0 155270560 green -chr - hsY Y 0 59373566 green -band hs1 p36.33 p36.33 0 2300000 gneg -band hs1 p36.32 p36.32 2300000 5400000 gpos25 -band hs1 p36.31 p36.31 5400000 7200000 gneg -band hs1 p36.23 p36.23 7200000 9200000 gpos25 -band hs1 p36.22 p36.22 9200000 12700000 gneg -band hs1 p36.21 p36.21 12700000 16200000 gpos50 -band hs1 p36.13 p36.13 16200000 20400000 gneg -band hs1 p36.12 p36.12 20400000 23900000 gpos25 -band hs1 p36.11 p36.11 23900000 28000000 gneg -band hs1 p35.3 p35.3 28000000 30200000 gpos25 -band hs1 p35.2 p35.2 30200000 32400000 gneg -band hs1 p35.1 p35.1 32400000 34600000 gpos25 -band hs1 p34.3 p34.3 34600000 40100000 gneg -band hs1 p34.2 p34.2 40100000 44100000 gpos25 -band hs1 p34.1 p34.1 44100000 46800000 gneg -band hs1 p33 p33 46800000 50700000 gpos75 -band hs1 p32.3 p32.3 50700000 56100000 gneg -band hs1 p32.2 p32.2 56100000 59000000 gpos50 -band hs1 p32.1 p32.1 59000000 61300000 gneg -band hs1 p31.3 p31.3 61300000 68900000 gpos50 -band hs1 p31.2 p31.2 68900000 69700000 gneg -band hs1 p31.1 p31.1 69700000 84900000 gpos100 -band hs1 p22.3 p22.3 84900000 88400000 gneg -band hs1 p22.2 p22.2 88400000 92000000 gpos75 -band hs1 p22.1 p22.1 92000000 94700000 gneg -band hs1 p21.3 p21.3 94700000 99700000 gpos75 -band hs1 p21.2 p21.2 99700000 102200000 gneg -band hs1 p21.1 p21.1 102200000 107200000 gpos100 -band hs1 p13.3 p13.3 107200000 111800000 gneg -band hs1 p13.2 p13.2 111800000 116100000 gpos50 -band hs1 p13.1 p13.1 116100000 117800000 gneg -band hs1 p12 p12 117800000 120600000 gpos50 -band hs1 p11.2 p11.2 120600000 121500000 gneg -band hs1 p11.1 p11.1 121500000 125000000 acen -band hs1 q11 q11 125000000 128900000 acen -band hs1 q12 q12 128900000 142600000 gvar -band hs1 q21.1 q21.1 142600000 147000000 gneg -band hs1 q21.2 q21.2 147000000 150300000 gpos50 -band hs1 q21.3 q21.3 150300000 155000000 gneg -band hs1 q22 q22 155000000 156500000 gpos50 -band hs1 q23.1 q23.1 156500000 159100000 gneg -band hs1 q23.2 q23.2 159100000 160500000 gpos50 -band hs1 q23.3 q23.3 160500000 165500000 gneg -band hs1 q24.1 q24.1 165500000 167200000 gpos50 -band hs1 q24.2 q24.2 167200000 170900000 gneg -band hs1 q24.3 q24.3 170900000 172900000 gpos75 -band hs1 q25.1 q25.1 172900000 176000000 gneg -band hs1 q25.2 q25.2 176000000 180300000 gpos50 -band hs1 q25.3 q25.3 180300000 185800000 gneg -band hs1 q31.1 q31.1 185800000 190800000 gpos100 -band hs1 q31.2 q31.2 190800000 193800000 gneg -band hs1 q31.3 q31.3 193800000 198700000 gpos100 -band hs1 q32.1 q32.1 198700000 207200000 gneg -band hs1 q32.2 q32.2 207200000 211500000 gpos25 -band hs1 q32.3 q32.3 211500000 214500000 gneg -band hs1 q41 q41 214500000 224100000 gpos100 -band hs1 q42.11 q42.11 224100000 224600000 gneg -band hs1 q42.12 q42.12 224600000 227000000 gpos25 -band hs1 q42.13 q42.13 227000000 230700000 gneg -band hs1 q42.2 q42.2 230700000 234700000 gpos50 -band hs1 q42.3 q42.3 234700000 236600000 gneg -band hs1 q43 q43 236600000 243700000 gpos75 -band hs1 q44 q44 243700000 249250621 gneg -band hs10 p15.3 p15.3 0 3000000 gneg -band hs10 p15.2 p15.2 3000000 3800000 gpos25 -band hs10 p15.1 p15.1 3800000 6600000 gneg -band hs10 p14 p14 6600000 12200000 gpos75 -band hs10 p13 p13 12200000 17300000 gneg -band hs10 p12.33 p12.33 17300000 18600000 gpos75 -band hs10 p12.32 p12.32 18600000 18700000 gneg -band hs10 p12.31 p12.31 18700000 22600000 gpos75 -band hs10 p12.2 p12.2 22600000 24600000 gneg -band hs10 p12.1 p12.1 24600000 29600000 gpos50 -band hs10 p11.23 p11.23 29600000 31300000 gneg -band hs10 p11.22 p11.22 31300000 34400000 gpos25 -band hs10 p11.21 p11.21 34400000 38000000 gneg -band hs10 p11.1 p11.1 38000000 40200000 acen -band hs10 q11.1 q11.1 40200000 42300000 acen -band hs10 q11.21 q11.21 42300000 46100000 gneg -band hs10 q11.22 q11.22 46100000 49900000 gpos25 -band hs10 q11.23 q11.23 49900000 52900000 gneg -band hs10 q21.1 q21.1 52900000 61200000 gpos100 -band hs10 q21.2 q21.2 61200000 64500000 gneg -band hs10 q21.3 q21.3 64500000 70600000 gpos100 -band hs10 q22.1 q22.1 70600000 74900000 gneg -band hs10 q22.2 q22.2 74900000 77700000 gpos50 -band hs10 q22.3 q22.3 77700000 82000000 gneg -band hs10 q23.1 q23.1 82000000 87900000 gpos100 -band hs10 q23.2 q23.2 87900000 89500000 gneg -band hs10 q23.31 q23.31 89500000 92900000 gpos75 -band hs10 q23.32 q23.32 92900000 94100000 gneg -band hs10 q23.33 q23.33 94100000 97000000 gpos50 -band hs10 q24.1 q24.1 97000000 99300000 gneg -band hs10 q24.2 q24.2 99300000 101900000 gpos50 -band hs10 q24.31 q24.31 101900000 103000000 gneg -band hs10 q24.32 q24.32 103000000 104900000 gpos25 -band hs10 q24.33 q24.33 104900000 105800000 gneg -band hs10 q25.1 q25.1 105800000 111900000 gpos100 -band hs10 q25.2 q25.2 111900000 114900000 gneg -band hs10 q25.3 q25.3 114900000 119100000 gpos75 -band hs10 q26.11 q26.11 119100000 121700000 gneg -band hs10 q26.12 q26.12 121700000 123100000 gpos50 -band hs10 q26.13 q26.13 123100000 127500000 gneg -band hs10 q26.2 q26.2 127500000 130600000 gpos50 -band hs10 q26.3 q26.3 130600000 135534747 gneg -band hs11 p15.5 p15.5 0 2800000 gneg -band hs11 p15.4 p15.4 2800000 10700000 gpos50 -band hs11 p15.3 p15.3 10700000 12700000 gneg -band hs11 p15.2 p15.2 12700000 16200000 gpos50 -band hs11 p15.1 p15.1 16200000 21700000 gneg -band hs11 p14.3 p14.3 21700000 26100000 gpos100 -band hs11 p14.2 p14.2 26100000 27200000 gneg -band hs11 p14.1 p14.1 27200000 31000000 gpos75 -band hs11 p13 p13 31000000 36400000 gneg -band hs11 p12 p12 36400000 43500000 gpos100 -band hs11 p11.2 p11.2 43500000 48800000 gneg -band hs11 p11.12 p11.12 48800000 51600000 gpos75 -band hs11 p11.11 p11.11 51600000 53700000 acen -band hs11 q11 q11 53700000 55700000 acen -band hs11 q12.1 q12.1 55700000 59900000 gpos75 -band hs11 q12.2 q12.2 59900000 61700000 gneg -band hs11 q12.3 q12.3 61700000 63400000 gpos25 -band hs11 q13.1 q13.1 63400000 65900000 gneg -band hs11 q13.2 q13.2 65900000 68400000 gpos25 -band hs11 q13.3 q13.3 68400000 70400000 gneg -band hs11 q13.4 q13.4 70400000 75200000 gpos50 -band hs11 q13.5 q13.5 75200000 77100000 gneg -band hs11 q14.1 q14.1 77100000 85600000 gpos100 -band hs11 q14.2 q14.2 85600000 88300000 gneg -band hs11 q14.3 q14.3 88300000 92800000 gpos100 -band hs11 q21 q21 92800000 97200000 gneg -band hs11 q22.1 q22.1 97200000 102100000 gpos100 -band hs11 q22.2 q22.2 102100000 102900000 gneg -band hs11 q22.3 q22.3 102900000 110400000 gpos100 -band hs11 q23.1 q23.1 110400000 112500000 gneg -band hs11 q23.2 q23.2 112500000 114500000 gpos50 -band hs11 q23.3 q23.3 114500000 121200000 gneg -band hs11 q24.1 q24.1 121200000 123900000 gpos50 -band hs11 q24.2 q24.2 123900000 127800000 gneg -band hs11 q24.3 q24.3 127800000 130800000 gpos50 -band hs11 q25 q25 130800000 135006516 gneg -band hs12 p13.33 p13.33 0 3300000 gneg -band hs12 p13.32 p13.32 3300000 5400000 gpos25 -band hs12 p13.31 p13.31 5400000 10100000 gneg -band hs12 p13.2 p13.2 10100000 12800000 gpos75 -band hs12 p13.1 p13.1 12800000 14800000 gneg -band hs12 p12.3 p12.3 14800000 20000000 gpos100 -band hs12 p12.2 p12.2 20000000 21300000 gneg -band hs12 p12.1 p12.1 21300000 26500000 gpos100 -band hs12 p11.23 p11.23 26500000 27800000 gneg -band hs12 p11.22 p11.22 27800000 30700000 gpos50 -band hs12 p11.21 p11.21 30700000 33300000 gneg -band hs12 p11.1 p11.1 33300000 35800000 acen -band hs12 q11 q11 35800000 38200000 acen -band hs12 q12 q12 38200000 46400000 gpos100 -band hs12 q13.11 q13.11 46400000 49100000 gneg -band hs12 q13.12 q13.12 49100000 51500000 gpos25 -band hs12 q13.13 q13.13 51500000 54900000 gneg -band hs12 q13.2 q13.2 54900000 56600000 gpos25 -band hs12 q13.3 q13.3 56600000 58100000 gneg -band hs12 q14.1 q14.1 58100000 63100000 gpos75 -band hs12 q14.2 q14.2 63100000 65100000 gneg -band hs12 q14.3 q14.3 65100000 67700000 gpos50 -band hs12 q15 q15 67700000 71500000 gneg -band hs12 q21.1 q21.1 71500000 75700000 gpos75 -band hs12 q21.2 q21.2 75700000 80300000 gneg -band hs12 q21.31 q21.31 80300000 86700000 gpos100 -band hs12 q21.32 q21.32 86700000 89000000 gneg -band hs12 q21.33 q21.33 89000000 92600000 gpos100 -band hs12 q22 q22 92600000 96200000 gneg -band hs12 q23.1 q23.1 96200000 101600000 gpos75 -band hs12 q23.2 q23.2 101600000 103800000 gneg -band hs12 q23.3 q23.3 103800000 109000000 gpos50 -band hs12 q24.11 q24.11 109000000 111700000 gneg -band hs12 q24.12 q24.12 111700000 112300000 gpos25 -band hs12 q24.13 q24.13 112300000 114300000 gneg -band hs12 q24.21 q24.21 114300000 116800000 gpos50 -band hs12 q24.22 q24.22 116800000 118100000 gneg -band hs12 q24.23 q24.23 118100000 120700000 gpos50 -band hs12 q24.31 q24.31 120700000 125900000 gneg -band hs12 q24.32 q24.32 125900000 129300000 gpos50 -band hs12 q24.33 q24.33 129300000 133851895 gneg -band hs13 p13 p13 0 4500000 gvar -band hs13 p12 p12 4500000 10000000 stalk -band hs13 p11.2 p11.2 10000000 16300000 gvar -band hs13 p11.1 p11.1 16300000 17900000 acen -band hs13 q11 q11 17900000 19500000 acen -band hs13 q12.11 q12.11 19500000 23300000 gneg -band hs13 q12.12 q12.12 23300000 25500000 gpos25 -band hs13 q12.13 q12.13 25500000 27800000 gneg -band hs13 q12.2 q12.2 27800000 28900000 gpos25 -band hs13 q12.3 q12.3 28900000 32200000 gneg -band hs13 q13.1 q13.1 32200000 34000000 gpos50 -band hs13 q13.2 q13.2 34000000 35500000 gneg -band hs13 q13.3 q13.3 35500000 40100000 gpos75 -band hs13 q14.11 q14.11 40100000 45200000 gneg -band hs13 q14.12 q14.12 45200000 45800000 gpos25 -band hs13 q14.13 q14.13 45800000 47300000 gneg -band hs13 q14.2 q14.2 47300000 50900000 gpos50 -band hs13 q14.3 q14.3 50900000 55300000 gneg -band hs13 q21.1 q21.1 55300000 59600000 gpos100 -band hs13 q21.2 q21.2 59600000 62300000 gneg -band hs13 q21.31 q21.31 62300000 65700000 gpos75 -band hs13 q21.32 q21.32 65700000 68600000 gneg -band hs13 q21.33 q21.33 68600000 73300000 gpos100 -band hs13 q22.1 q22.1 73300000 75400000 gneg -band hs13 q22.2 q22.2 75400000 77200000 gpos50 -band hs13 q22.3 q22.3 77200000 79000000 gneg -band hs13 q31.1 q31.1 79000000 87700000 gpos100 -band hs13 q31.2 q31.2 87700000 90000000 gneg -band hs13 q31.3 q31.3 90000000 95000000 gpos100 -band hs13 q32.1 q32.1 95000000 98200000 gneg -band hs13 q32.2 q32.2 98200000 99300000 gpos25 -band hs13 q32.3 q32.3 99300000 101700000 gneg -band hs13 q33.1 q33.1 101700000 104800000 gpos100 -band hs13 q33.2 q33.2 104800000 107000000 gneg -band hs13 q33.3 q33.3 107000000 110300000 gpos100 -band hs13 q34 q34 110300000 115169878 gneg -band hs14 p13 p13 0 3700000 gvar -band hs14 p12 p12 3700000 8100000 stalk -band hs14 p11.2 p11.2 8100000 16100000 gvar -band hs14 p11.1 p11.1 16100000 17600000 acen -band hs14 q11.1 q11.1 17600000 19100000 acen -band hs14 q11.2 q11.2 19100000 24600000 gneg -band hs14 q12 q12 24600000 33300000 gpos100 -band hs14 q13.1 q13.1 33300000 35300000 gneg -band hs14 q13.2 q13.2 35300000 36600000 gpos50 -band hs14 q13.3 q13.3 36600000 37800000 gneg -band hs14 q21.1 q21.1 37800000 43500000 gpos100 -band hs14 q21.2 q21.2 43500000 47200000 gneg -band hs14 q21.3 q21.3 47200000 50900000 gpos100 -band hs14 q22.1 q22.1 50900000 54100000 gneg -band hs14 q22.2 q22.2 54100000 55500000 gpos25 -band hs14 q22.3 q22.3 55500000 58100000 gneg -band hs14 q23.1 q23.1 58100000 62100000 gpos75 -band hs14 q23.2 q23.2 62100000 64800000 gneg -band hs14 q23.3 q23.3 64800000 67900000 gpos50 -band hs14 q24.1 q24.1 67900000 70200000 gneg -band hs14 q24.2 q24.2 70200000 73800000 gpos50 -band hs14 q24.3 q24.3 73800000 79300000 gneg -band hs14 q31.1 q31.1 79300000 83600000 gpos100 -band hs14 q31.2 q31.2 83600000 84900000 gneg -band hs14 q31.3 q31.3 84900000 89800000 gpos100 -band hs14 q32.11 q32.11 89800000 91900000 gneg -band hs14 q32.12 q32.12 91900000 94700000 gpos25 -band hs14 q32.13 q32.13 94700000 96300000 gneg -band hs14 q32.2 q32.2 96300000 101400000 gpos50 -band hs14 q32.31 q32.31 101400000 103200000 gneg -band hs14 q32.32 q32.32 103200000 104000000 gpos50 -band hs14 q32.33 q32.33 104000000 107349540 gneg -band hs15 p13 p13 0 3900000 gvar -band hs15 p12 p12 3900000 8700000 stalk -band hs15 p11.2 p11.2 8700000 15800000 gvar -band hs15 p11.1 p11.1 15800000 19000000 acen -band hs15 q11.1 q11.1 19000000 20700000 acen -band hs15 q11.2 q11.2 20700000 25700000 gneg -band hs15 q12 q12 25700000 28100000 gpos50 -band hs15 q13.1 q13.1 28100000 30300000 gneg -band hs15 q13.2 q13.2 30300000 31200000 gpos50 -band hs15 q13.3 q13.3 31200000 33600000 gneg -band hs15 q14 q14 33600000 40100000 gpos75 -band hs15 q15.1 q15.1 40100000 42800000 gneg -band hs15 q15.2 q15.2 42800000 43600000 gpos25 -band hs15 q15.3 q15.3 43600000 44800000 gneg -band hs15 q21.1 q21.1 44800000 49500000 gpos75 -band hs15 q21.2 q21.2 49500000 52900000 gneg -band hs15 q21.3 q21.3 52900000 59100000 gpos75 -band hs15 q22.1 q22.1 59100000 59300000 gneg -band hs15 q22.2 q22.2 59300000 63700000 gpos25 -band hs15 q22.31 q22.31 63700000 67200000 gneg -band hs15 q22.32 q22.32 67200000 67300000 gpos25 -band hs15 q22.33 q22.33 67300000 67500000 gneg -band hs15 q23 q23 67500000 72700000 gpos25 -band hs15 q24.1 q24.1 72700000 75200000 gneg -band hs15 q24.2 q24.2 75200000 76600000 gpos25 -band hs15 q24.3 q24.3 76600000 78300000 gneg -band hs15 q25.1 q25.1 78300000 81700000 gpos50 -band hs15 q25.2 q25.2 81700000 85200000 gneg -band hs15 q25.3 q25.3 85200000 89100000 gpos50 -band hs15 q26.1 q26.1 89100000 94300000 gneg -band hs15 q26.2 q26.2 94300000 98500000 gpos50 -band hs15 q26.3 q26.3 98500000 102531392 gneg -band hs16 p13.3 p13.3 0 7900000 gneg -band hs16 p13.2 p13.2 7900000 10500000 gpos50 -band hs16 p13.13 p13.13 10500000 12600000 gneg -band hs16 p13.12 p13.12 12600000 14800000 gpos50 -band hs16 p13.11 p13.11 14800000 16800000 gneg -band hs16 p12.3 p12.3 16800000 21200000 gpos50 -band hs16 p12.2 p12.2 21200000 24200000 gneg -band hs16 p12.1 p12.1 24200000 28100000 gpos50 -band hs16 p11.2 p11.2 28100000 34600000 gneg -band hs16 p11.1 p11.1 34600000 36600000 acen -band hs16 q11.1 q11.1 36600000 38600000 acen -band hs16 q11.2 q11.2 38600000 47000000 gvar -band hs16 q12.1 q12.1 47000000 52600000 gneg -band hs16 q12.2 q12.2 52600000 56700000 gpos50 -band hs16 q13 q13 56700000 57400000 gneg -band hs16 q21 q21 57400000 66700000 gpos100 -band hs16 q22.1 q22.1 66700000 70800000 gneg -band hs16 q22.2 q22.2 70800000 72900000 gpos50 -band hs16 q22.3 q22.3 72900000 74100000 gneg -band hs16 q23.1 q23.1 74100000 79200000 gpos75 -band hs16 q23.2 q23.2 79200000 81700000 gneg -band hs16 q23.3 q23.3 81700000 84200000 gpos50 -band hs16 q24.1 q24.1 84200000 87100000 gneg -band hs16 q24.2 q24.2 87100000 88700000 gpos25 -band hs16 q24.3 q24.3 88700000 90354753 gneg -band hs17 p13.3 p13.3 0 3300000 gneg -band hs17 p13.2 p13.2 3300000 6500000 gpos50 -band hs17 p13.1 p13.1 6500000 10700000 gneg -band hs17 p12 p12 10700000 16000000 gpos75 -band hs17 p11.2 p11.2 16000000 22200000 gneg -band hs17 p11.1 p11.1 22200000 24000000 acen -band hs17 q11.1 q11.1 24000000 25800000 acen -band hs17 q11.2 q11.2 25800000 31800000 gneg -band hs17 q12 q12 31800000 38100000 gpos50 -band hs17 q21.1 q21.1 38100000 38400000 gneg -band hs17 q21.2 q21.2 38400000 40900000 gpos25 -band hs17 q21.31 q21.31 40900000 44900000 gneg -band hs17 q21.32 q21.32 44900000 47400000 gpos25 -band hs17 q21.33 q21.33 47400000 50200000 gneg -band hs17 q22 q22 50200000 57600000 gpos75 -band hs17 q23.1 q23.1 57600000 58300000 gneg -band hs17 q23.2 q23.2 58300000 61100000 gpos75 -band hs17 q23.3 q23.3 61100000 62600000 gneg -band hs17 q24.1 q24.1 62600000 64200000 gpos50 -band hs17 q24.2 q24.2 64200000 67100000 gneg -band hs17 q24.3 q24.3 67100000 70900000 gpos75 -band hs17 q25.1 q25.1 70900000 74800000 gneg -band hs17 q25.2 q25.2 74800000 75300000 gpos25 -band hs17 q25.3 q25.3 75300000 81195210 gneg -band hs18 p11.32 p11.32 0 2900000 gneg -band hs18 p11.31 p11.31 2900000 7100000 gpos50 -band hs18 p11.23 p11.23 7100000 8500000 gneg -band hs18 p11.22 p11.22 8500000 10900000 gpos25 -band hs18 p11.21 p11.21 10900000 15400000 gneg -band hs18 p11.1 p11.1 15400000 17200000 acen -band hs18 q11.1 q11.1 17200000 19000000 acen -band hs18 q11.2 q11.2 19000000 25000000 gneg -band hs18 q12.1 q12.1 25000000 32700000 gpos100 -band hs18 q12.2 q12.2 32700000 37200000 gneg -band hs18 q12.3 q12.3 37200000 43500000 gpos75 -band hs18 q21.1 q21.1 43500000 48200000 gneg -band hs18 q21.2 q21.2 48200000 53800000 gpos75 -band hs18 q21.31 q21.31 53800000 56200000 gneg -band hs18 q21.32 q21.32 56200000 59000000 gpos50 -band hs18 q21.33 q21.33 59000000 61600000 gneg -band hs18 q22.1 q22.1 61600000 66800000 gpos100 -band hs18 q22.2 q22.2 66800000 68700000 gneg -band hs18 q22.3 q22.3 68700000 73100000 gpos25 -band hs18 q23 q23 73100000 78077248 gneg -band hs19 p13.3 p13.3 0 6900000 gneg -band hs19 p13.2 p13.2 6900000 13900000 gpos25 -band hs19 p13.13 p13.13 13900000 14000000 gneg -band hs19 p13.12 p13.12 14000000 16300000 gpos25 -band hs19 p13.11 p13.11 16300000 20000000 gneg -band hs19 p12 p12 20000000 24400000 gvar -band hs19 p11 p11 24400000 26500000 acen -band hs19 q11 q11 26500000 28600000 acen -band hs19 q12 q12 28600000 32400000 gvar -band hs19 q13.11 q13.11 32400000 35500000 gneg -band hs19 q13.12 q13.12 35500000 38300000 gpos25 -band hs19 q13.13 q13.13 38300000 38700000 gneg -band hs19 q13.2 q13.2 38700000 43400000 gpos25 -band hs19 q13.31 q13.31 43400000 45200000 gneg -band hs19 q13.32 q13.32 45200000 48000000 gpos25 -band hs19 q13.33 q13.33 48000000 51400000 gneg -band hs19 q13.41 q13.41 51400000 53600000 gpos25 -band hs19 q13.42 q13.42 53600000 56300000 gneg -band hs19 q13.43 q13.43 56300000 59128983 gpos25 -band hs2 p25.3 p25.3 0 4400000 gneg -band hs2 p25.2 p25.2 4400000 7100000 gpos50 -band hs2 p25.1 p25.1 7100000 12200000 gneg -band hs2 p24.3 p24.3 12200000 16700000 gpos75 -band hs2 p24.2 p24.2 16700000 19200000 gneg -band hs2 p24.1 p24.1 19200000 24000000 gpos75 -band hs2 p23.3 p23.3 24000000 27900000 gneg -band hs2 p23.2 p23.2 27900000 30000000 gpos25 -band hs2 p23.1 p23.1 30000000 32100000 gneg -band hs2 p22.3 p22.3 32100000 36600000 gpos75 -band hs2 p22.2 p22.2 36600000 38600000 gneg -band hs2 p22.1 p22.1 38600000 41800000 gpos50 -band hs2 p21 p21 41800000 47800000 gneg -band hs2 p16.3 p16.3 47800000 52900000 gpos100 -band hs2 p16.2 p16.2 52900000 55000000 gneg -band hs2 p16.1 p16.1 55000000 61300000 gpos100 -band hs2 p15 p15 61300000 64100000 gneg -band hs2 p14 p14 64100000 68600000 gpos50 -band hs2 p13.3 p13.3 68600000 71500000 gneg -band hs2 p13.2 p13.2 71500000 73500000 gpos50 -band hs2 p13.1 p13.1 73500000 75000000 gneg -band hs2 p12 p12 75000000 83300000 gpos100 -band hs2 p11.2 p11.2 83300000 90500000 gneg -band hs2 p11.1 p11.1 90500000 93300000 acen -band hs2 q11.1 q11.1 93300000 96800000 acen -band hs2 q11.2 q11.2 96800000 102700000 gneg -band hs2 q12.1 q12.1 102700000 106000000 gpos50 -band hs2 q12.2 q12.2 106000000 107500000 gneg -band hs2 q12.3 q12.3 107500000 110200000 gpos25 -band hs2 q13 q13 110200000 114400000 gneg -band hs2 q14.1 q14.1 114400000 118800000 gpos50 -band hs2 q14.2 q14.2 118800000 122400000 gneg -band hs2 q14.3 q14.3 122400000 129900000 gpos50 -band hs2 q21.1 q21.1 129900000 132500000 gneg -band hs2 q21.2 q21.2 132500000 135100000 gpos25 -band hs2 q21.3 q21.3 135100000 136800000 gneg -band hs2 q22.1 q22.1 136800000 142200000 gpos100 -band hs2 q22.2 q22.2 142200000 144100000 gneg -band hs2 q22.3 q22.3 144100000 148700000 gpos100 -band hs2 q23.1 q23.1 148700000 149900000 gneg -band hs2 q23.2 q23.2 149900000 150500000 gpos25 -band hs2 q23.3 q23.3 150500000 154900000 gneg -band hs2 q24.1 q24.1 154900000 159800000 gpos75 -band hs2 q24.2 q24.2 159800000 163700000 gneg -band hs2 q24.3 q24.3 163700000 169700000 gpos75 -band hs2 q31.1 q31.1 169700000 178000000 gneg -band hs2 q31.2 q31.2 178000000 180600000 gpos50 -band hs2 q31.3 q31.3 180600000 183000000 gneg -band hs2 q32.1 q32.1 183000000 189400000 gpos75 -band hs2 q32.2 q32.2 189400000 191900000 gneg -band hs2 q32.3 q32.3 191900000 197400000 gpos75 -band hs2 q33.1 q33.1 197400000 203300000 gneg -band hs2 q33.2 q33.2 203300000 204900000 gpos50 -band hs2 q33.3 q33.3 204900000 209000000 gneg -band hs2 q34 q34 209000000 215300000 gpos100 -band hs2 q35 q35 215300000 221500000 gneg -band hs2 q36.1 q36.1 221500000 225200000 gpos75 -band hs2 q36.2 q36.2 225200000 226100000 gneg -band hs2 q36.3 q36.3 226100000 231000000 gpos100 -band hs2 q37.1 q37.1 231000000 235600000 gneg -band hs2 q37.2 q37.2 235600000 237300000 gpos50 -band hs2 q37.3 q37.3 237300000 243199373 gneg -band hs20 p13 p13 0 5100000 gneg -band hs20 p12.3 p12.3 5100000 9200000 gpos75 -band hs20 p12.2 p12.2 9200000 12100000 gneg -band hs20 p12.1 p12.1 12100000 17900000 gpos75 -band hs20 p11.23 p11.23 17900000 21300000 gneg -band hs20 p11.22 p11.22 21300000 22300000 gpos25 -band hs20 p11.21 p11.21 22300000 25600000 gneg -band hs20 p11.1 p11.1 25600000 27500000 acen -band hs20 q11.1 q11.1 27500000 29400000 acen -band hs20 q11.21 q11.21 29400000 32100000 gneg -band hs20 q11.22 q11.22 32100000 34400000 gpos25 -band hs20 q11.23 q11.23 34400000 37600000 gneg -band hs20 q12 q12 37600000 41700000 gpos75 -band hs20 q13.11 q13.11 41700000 42100000 gneg -band hs20 q13.12 q13.12 42100000 46400000 gpos25 -band hs20 q13.13 q13.13 46400000 49800000 gneg -band hs20 q13.2 q13.2 49800000 55000000 gpos75 -band hs20 q13.31 q13.31 55000000 56500000 gneg -band hs20 q13.32 q13.32 56500000 58400000 gpos50 -band hs20 q13.33 q13.33 58400000 63025520 gneg -band hs21 p13 p13 0 2800000 gvar -band hs21 p12 p12 2800000 6800000 stalk -band hs21 p11.2 p11.2 6800000 10900000 gvar -band hs21 p11.1 p11.1 10900000 13200000 acen -band hs21 q11.1 q11.1 13200000 14300000 acen -band hs21 q11.2 q11.2 14300000 16400000 gneg -band hs21 q21.1 q21.1 16400000 24000000 gpos100 -band hs21 q21.2 q21.2 24000000 26800000 gneg -band hs21 q21.3 q21.3 26800000 31500000 gpos75 -band hs21 q22.11 q22.11 31500000 35800000 gneg -band hs21 q22.12 q22.12 35800000 37800000 gpos50 -band hs21 q22.13 q22.13 37800000 39700000 gneg -band hs21 q22.2 q22.2 39700000 42600000 gpos50 -band hs21 q22.3 q22.3 42600000 48129895 gneg -band hs22 p13 p13 0 3800000 gvar -band hs22 p12 p12 3800000 8300000 stalk -band hs22 p11.2 p11.2 8300000 12200000 gvar -band hs22 p11.1 p11.1 12200000 14700000 acen -band hs22 q11.1 q11.1 14700000 17900000 acen -band hs22 q11.21 q11.21 17900000 22200000 gneg -band hs22 q11.22 q11.22 22200000 23500000 gpos25 -band hs22 q11.23 q11.23 23500000 25900000 gneg -band hs22 q12.1 q12.1 25900000 29600000 gpos50 -band hs22 q12.2 q12.2 29600000 32200000 gneg -band hs22 q12.3 q12.3 32200000 37600000 gpos50 -band hs22 q13.1 q13.1 37600000 41000000 gneg -band hs22 q13.2 q13.2 41000000 44200000 gpos50 -band hs22 q13.31 q13.31 44200000 48400000 gneg -band hs22 q13.32 q13.32 48400000 49400000 gpos50 -band hs22 q13.33 q13.33 49400000 51304566 gneg -band hs3 p26.3 p26.3 0 2800000 gpos50 -band hs3 p26.2 p26.2 2800000 4000000 gneg -band hs3 p26.1 p26.1 4000000 8700000 gpos50 -band hs3 p25.3 p25.3 8700000 11800000 gneg -band hs3 p25.2 p25.2 11800000 13300000 gpos25 -band hs3 p25.1 p25.1 13300000 16400000 gneg -band hs3 p24.3 p24.3 16400000 23900000 gpos100 -band hs3 p24.2 p24.2 23900000 26400000 gneg -band hs3 p24.1 p24.1 26400000 30900000 gpos75 -band hs3 p23 p23 30900000 32100000 gneg -band hs3 p22.3 p22.3 32100000 36500000 gpos50 -band hs3 p22.2 p22.2 36500000 39400000 gneg -band hs3 p22.1 p22.1 39400000 43700000 gpos75 -band hs3 p21.33 p21.33 43700000 44100000 gneg -band hs3 p21.32 p21.32 44100000 44200000 gpos50 -band hs3 p21.31 p21.31 44200000 50600000 gneg -band hs3 p21.2 p21.2 50600000 52300000 gpos25 -band hs3 p21.1 p21.1 52300000 54400000 gneg -band hs3 p14.3 p14.3 54400000 58600000 gpos50 -band hs3 p14.2 p14.2 58600000 63700000 gneg -band hs3 p14.1 p14.1 63700000 69800000 gpos50 -band hs3 p13 p13 69800000 74200000 gneg -band hs3 p12.3 p12.3 74200000 79800000 gpos75 -band hs3 p12.2 p12.2 79800000 83500000 gneg -band hs3 p12.1 p12.1 83500000 87200000 gpos75 -band hs3 p11.2 p11.2 87200000 87900000 gneg -band hs3 p11.1 p11.1 87900000 91000000 acen -band hs3 q11.1 q11.1 91000000 93900000 acen -band hs3 q11.2 q11.2 93900000 98300000 gvar -band hs3 q12.1 q12.1 98300000 100000000 gneg -band hs3 q12.2 q12.2 100000000 100900000 gpos25 -band hs3 q12.3 q12.3 100900000 102800000 gneg -band hs3 q13.11 q13.11 102800000 106200000 gpos75 -band hs3 q13.12 q13.12 106200000 107900000 gneg -band hs3 q13.13 q13.13 107900000 111300000 gpos50 -band hs3 q13.2 q13.2 111300000 113500000 gneg -band hs3 q13.31 q13.31 113500000 117300000 gpos75 -band hs3 q13.32 q13.32 117300000 119000000 gneg -band hs3 q13.33 q13.33 119000000 121900000 gpos75 -band hs3 q21.1 q21.1 121900000 123800000 gneg -band hs3 q21.2 q21.2 123800000 125800000 gpos25 -band hs3 q21.3 q21.3 125800000 129200000 gneg -band hs3 q22.1 q22.1 129200000 133700000 gpos25 -band hs3 q22.2 q22.2 133700000 135700000 gneg -band hs3 q22.3 q22.3 135700000 138700000 gpos25 -band hs3 q23 q23 138700000 142800000 gneg -band hs3 q24 q24 142800000 148900000 gpos100 -band hs3 q25.1 q25.1 148900000 152100000 gneg -band hs3 q25.2 q25.2 152100000 155000000 gpos50 -band hs3 q25.31 q25.31 155000000 157000000 gneg -band hs3 q25.32 q25.32 157000000 159000000 gpos50 -band hs3 q25.33 q25.33 159000000 160700000 gneg -band hs3 q26.1 q26.1 160700000 167600000 gpos100 -band hs3 q26.2 q26.2 167600000 170900000 gneg -band hs3 q26.31 q26.31 170900000 175700000 gpos75 -band hs3 q26.32 q26.32 175700000 179000000 gneg -band hs3 q26.33 q26.33 179000000 182700000 gpos75 -band hs3 q27.1 q27.1 182700000 184500000 gneg -band hs3 q27.2 q27.2 184500000 186000000 gpos25 -band hs3 q27.3 q27.3 186000000 187900000 gneg -band hs3 q28 q28 187900000 192300000 gpos75 -band hs3 q29 q29 192300000 198022430 gneg -band hs4 p16.3 p16.3 0 4500000 gneg -band hs4 p16.2 p16.2 4500000 6000000 gpos25 -band hs4 p16.1 p16.1 6000000 11300000 gneg -band hs4 p15.33 p15.33 11300000 15200000 gpos50 -band hs4 p15.32 p15.32 15200000 17800000 gneg -band hs4 p15.31 p15.31 17800000 21300000 gpos75 -band hs4 p15.2 p15.2 21300000 27700000 gneg -band hs4 p15.1 p15.1 27700000 35800000 gpos100 -band hs4 p14 p14 35800000 41200000 gneg -band hs4 p13 p13 41200000 44600000 gpos50 -band hs4 p12 p12 44600000 48200000 gneg -band hs4 p11 p11 48200000 50400000 acen -band hs4 q11 q11 50400000 52700000 acen -band hs4 q12 q12 52700000 59500000 gneg -band hs4 q13.1 q13.1 59500000 66600000 gpos100 -band hs4 q13.2 q13.2 66600000 70500000 gneg -band hs4 q13.3 q13.3 70500000 76300000 gpos75 -band hs4 q21.1 q21.1 76300000 78900000 gneg -band hs4 q21.21 q21.21 78900000 82400000 gpos50 -band hs4 q21.22 q21.22 82400000 84100000 gneg -band hs4 q21.23 q21.23 84100000 86900000 gpos25 -band hs4 q21.3 q21.3 86900000 88000000 gneg -band hs4 q22.1 q22.1 88000000 93700000 gpos75 -band hs4 q22.2 q22.2 93700000 95100000 gneg -band hs4 q22.3 q22.3 95100000 98800000 gpos75 -band hs4 q23 q23 98800000 101100000 gneg -band hs4 q24 q24 101100000 107700000 gpos50 -band hs4 q25 q25 107700000 114100000 gneg -band hs4 q26 q26 114100000 120800000 gpos75 -band hs4 q27 q27 120800000 123800000 gneg -band hs4 q28.1 q28.1 123800000 128800000 gpos50 -band hs4 q28.2 q28.2 128800000 131100000 gneg -band hs4 q28.3 q28.3 131100000 139500000 gpos100 -band hs4 q31.1 q31.1 139500000 141500000 gneg -band hs4 q31.21 q31.21 141500000 146800000 gpos25 -band hs4 q31.22 q31.22 146800000 148500000 gneg -band hs4 q31.23 q31.23 148500000 151100000 gpos25 -band hs4 q31.3 q31.3 151100000 155600000 gneg -band hs4 q32.1 q32.1 155600000 161800000 gpos100 -band hs4 q32.2 q32.2 161800000 164500000 gneg -band hs4 q32.3 q32.3 164500000 170100000 gpos100 -band hs4 q33 q33 170100000 171900000 gneg -band hs4 q34.1 q34.1 171900000 176300000 gpos75 -band hs4 q34.2 q34.2 176300000 177500000 gneg -band hs4 q34.3 q34.3 177500000 183200000 gpos100 -band hs4 q35.1 q35.1 183200000 187100000 gneg -band hs4 q35.2 q35.2 187100000 191154276 gpos25 -band hs5 p15.33 p15.33 0 4500000 gneg -band hs5 p15.32 p15.32 4500000 6300000 gpos25 -band hs5 p15.31 p15.31 6300000 9800000 gneg -band hs5 p15.2 p15.2 9800000 15000000 gpos50 -band hs5 p15.1 p15.1 15000000 18400000 gneg -band hs5 p14.3 p14.3 18400000 23300000 gpos100 -band hs5 p14.2 p14.2 23300000 24600000 gneg -band hs5 p14.1 p14.1 24600000 28900000 gpos100 -band hs5 p13.3 p13.3 28900000 33800000 gneg -band hs5 p13.2 p13.2 33800000 38400000 gpos25 -band hs5 p13.1 p13.1 38400000 42500000 gneg -band hs5 p12 p12 42500000 46100000 gpos50 -band hs5 p11 p11 46100000 48400000 acen -band hs5 q11.1 q11.1 48400000 50700000 acen -band hs5 q11.2 q11.2 50700000 58900000 gneg -band hs5 q12.1 q12.1 58900000 62900000 gpos75 -band hs5 q12.2 q12.2 62900000 63200000 gneg -band hs5 q12.3 q12.3 63200000 66700000 gpos75 -band hs5 q13.1 q13.1 66700000 68400000 gneg -band hs5 q13.2 q13.2 68400000 73300000 gpos50 -band hs5 q13.3 q13.3 73300000 76900000 gneg -band hs5 q14.1 q14.1 76900000 81400000 gpos50 -band hs5 q14.2 q14.2 81400000 82800000 gneg -band hs5 q14.3 q14.3 82800000 92300000 gpos100 -band hs5 q15 q15 92300000 98200000 gneg -band hs5 q21.1 q21.1 98200000 102800000 gpos100 -band hs5 q21.2 q21.2 102800000 104500000 gneg -band hs5 q21.3 q21.3 104500000 109600000 gpos100 -band hs5 q22.1 q22.1 109600000 111500000 gneg -band hs5 q22.2 q22.2 111500000 113100000 gpos50 -band hs5 q22.3 q22.3 113100000 115200000 gneg -band hs5 q23.1 q23.1 115200000 121400000 gpos100 -band hs5 q23.2 q23.2 121400000 127300000 gneg -band hs5 q23.3 q23.3 127300000 130600000 gpos100 -band hs5 q31.1 q31.1 130600000 136200000 gneg -band hs5 q31.2 q31.2 136200000 139500000 gpos25 -band hs5 q31.3 q31.3 139500000 144500000 gneg -band hs5 q32 q32 144500000 149800000 gpos75 -band hs5 q33.1 q33.1 149800000 152700000 gneg -band hs5 q33.2 q33.2 152700000 155700000 gpos50 -band hs5 q33.3 q33.3 155700000 159900000 gneg -band hs5 q34 q34 159900000 168500000 gpos100 -band hs5 q35.1 q35.1 168500000 172800000 gneg -band hs5 q35.2 q35.2 172800000 176600000 gpos25 -band hs5 q35.3 q35.3 176600000 180915260 gneg -band hs6 p25.3 p25.3 0 2300000 gneg -band hs6 p25.2 p25.2 2300000 4200000 gpos25 -band hs6 p25.1 p25.1 4200000 7100000 gneg -band hs6 p24.3 p24.3 7100000 10600000 gpos50 -band hs6 p24.2 p24.2 10600000 11600000 gneg -band hs6 p24.1 p24.1 11600000 13400000 gpos25 -band hs6 p23 p23 13400000 15200000 gneg -band hs6 p22.3 p22.3 15200000 25200000 gpos75 -band hs6 p22.2 p22.2 25200000 27000000 gneg -band hs6 p22.1 p22.1 27000000 30400000 gpos50 -band hs6 p21.33 p21.33 30400000 32100000 gneg -band hs6 p21.32 p21.32 32100000 33500000 gpos25 -band hs6 p21.31 p21.31 33500000 36600000 gneg -band hs6 p21.2 p21.2 36600000 40500000 gpos25 -band hs6 p21.1 p21.1 40500000 46200000 gneg -band hs6 p12.3 p12.3 46200000 51800000 gpos100 -band hs6 p12.2 p12.2 51800000 52900000 gneg -band hs6 p12.1 p12.1 52900000 57000000 gpos100 -band hs6 p11.2 p11.2 57000000 58700000 gneg -band hs6 p11.1 p11.1 58700000 61000000 acen -band hs6 q11.1 q11.1 61000000 63300000 acen -band hs6 q11.2 q11.2 63300000 63400000 gneg -band hs6 q12 q12 63400000 70000000 gpos100 -band hs6 q13 q13 70000000 75900000 gneg -band hs6 q14.1 q14.1 75900000 83900000 gpos50 -band hs6 q14.2 q14.2 83900000 84900000 gneg -band hs6 q14.3 q14.3 84900000 88000000 gpos50 -band hs6 q15 q15 88000000 93100000 gneg -band hs6 q16.1 q16.1 93100000 99500000 gpos100 -band hs6 q16.2 q16.2 99500000 100600000 gneg -band hs6 q16.3 q16.3 100600000 105500000 gpos100 -band hs6 q21 q21 105500000 114600000 gneg -band hs6 q22.1 q22.1 114600000 118300000 gpos75 -band hs6 q22.2 q22.2 118300000 118500000 gneg -band hs6 q22.31 q22.31 118500000 126100000 gpos100 -band hs6 q22.32 q22.32 126100000 127100000 gneg -band hs6 q22.33 q22.33 127100000 130300000 gpos75 -band hs6 q23.1 q23.1 130300000 131200000 gneg -band hs6 q23.2 q23.2 131200000 135200000 gpos50 -band hs6 q23.3 q23.3 135200000 139000000 gneg -band hs6 q24.1 q24.1 139000000 142800000 gpos75 -band hs6 q24.2 q24.2 142800000 145600000 gneg -band hs6 q24.3 q24.3 145600000 149000000 gpos75 -band hs6 q25.1 q25.1 149000000 152500000 gneg -band hs6 q25.2 q25.2 152500000 155500000 gpos50 -band hs6 q25.3 q25.3 155500000 161000000 gneg -band hs6 q26 q26 161000000 164500000 gpos50 -band hs6 q27 q27 164500000 171115067 gneg -band hs7 p22.3 p22.3 0 2800000 gneg -band hs7 p22.2 p22.2 2800000 4500000 gpos25 -band hs7 p22.1 p22.1 4500000 7300000 gneg -band hs7 p21.3 p21.3 7300000 13800000 gpos100 -band hs7 p21.2 p21.2 13800000 16500000 gneg -band hs7 p21.1 p21.1 16500000 20900000 gpos100 -band hs7 p15.3 p15.3 20900000 25500000 gneg -band hs7 p15.2 p15.2 25500000 28000000 gpos50 -band hs7 p15.1 p15.1 28000000 28800000 gneg -band hs7 p14.3 p14.3 28800000 35000000 gpos75 -band hs7 p14.2 p14.2 35000000 37200000 gneg -band hs7 p14.1 p14.1 37200000 43300000 gpos75 -band hs7 p13 p13 43300000 45400000 gneg -band hs7 p12.3 p12.3 45400000 49000000 gpos75 -band hs7 p12.2 p12.2 49000000 50500000 gneg -band hs7 p12.1 p12.1 50500000 54000000 gpos75 -band hs7 p11.2 p11.2 54000000 58000000 gneg -band hs7 p11.1 p11.1 58000000 59900000 acen -band hs7 q11.1 q11.1 59900000 61700000 acen -band hs7 q11.21 q11.21 61700000 67000000 gneg -band hs7 q11.22 q11.22 67000000 72200000 gpos50 -band hs7 q11.23 q11.23 72200000 77500000 gneg -band hs7 q21.11 q21.11 77500000 86400000 gpos100 -band hs7 q21.12 q21.12 86400000 88200000 gneg -band hs7 q21.13 q21.13 88200000 91100000 gpos75 -band hs7 q21.2 q21.2 91100000 92800000 gneg -band hs7 q21.3 q21.3 92800000 98000000 gpos75 -band hs7 q22.1 q22.1 98000000 103800000 gneg -band hs7 q22.2 q22.2 103800000 104500000 gpos50 -band hs7 q22.3 q22.3 104500000 107400000 gneg -band hs7 q31.1 q31.1 107400000 114600000 gpos75 -band hs7 q31.2 q31.2 114600000 117400000 gneg -band hs7 q31.31 q31.31 117400000 121100000 gpos75 -band hs7 q31.32 q31.32 121100000 123800000 gneg -band hs7 q31.33 q31.33 123800000 127100000 gpos75 -band hs7 q32.1 q32.1 127100000 129200000 gneg -band hs7 q32.2 q32.2 129200000 130400000 gpos25 -band hs7 q32.3 q32.3 130400000 132600000 gneg -band hs7 q33 q33 132600000 138200000 gpos50 -band hs7 q34 q34 138200000 143100000 gneg -band hs7 q35 q35 143100000 147900000 gpos75 -band hs7 q36.1 q36.1 147900000 152600000 gneg -band hs7 q36.2 q36.2 152600000 155100000 gpos25 -band hs7 q36.3 q36.3 155100000 159138663 gneg -band hs8 p23.3 p23.3 0 2200000 gneg -band hs8 p23.2 p23.2 2200000 6200000 gpos75 -band hs8 p23.1 p23.1 6200000 12700000 gneg -band hs8 p22 p22 12700000 19000000 gpos100 -band hs8 p21.3 p21.3 19000000 23300000 gneg -band hs8 p21.2 p21.2 23300000 27400000 gpos50 -band hs8 p21.1 p21.1 27400000 28800000 gneg -band hs8 p12 p12 28800000 36500000 gpos75 -band hs8 p11.23 p11.23 36500000 38300000 gneg -band hs8 p11.22 p11.22 38300000 39700000 gpos25 -band hs8 p11.21 p11.21 39700000 43100000 gneg -band hs8 p11.1 p11.1 43100000 45600000 acen -band hs8 q11.1 q11.1 45600000 48100000 acen -band hs8 q11.21 q11.21 48100000 52200000 gneg -band hs8 q11.22 q11.22 52200000 52600000 gpos75 -band hs8 q11.23 q11.23 52600000 55500000 gneg -band hs8 q12.1 q12.1 55500000 61600000 gpos50 -band hs8 q12.2 q12.2 61600000 62200000 gneg -band hs8 q12.3 q12.3 62200000 66000000 gpos50 -band hs8 q13.1 q13.1 66000000 68000000 gneg -band hs8 q13.2 q13.2 68000000 70500000 gpos50 -band hs8 q13.3 q13.3 70500000 73900000 gneg -band hs8 q21.11 q21.11 73900000 78300000 gpos100 -band hs8 q21.12 q21.12 78300000 80100000 gneg -band hs8 q21.13 q21.13 80100000 84600000 gpos75 -band hs8 q21.2 q21.2 84600000 86900000 gneg -band hs8 q21.3 q21.3 86900000 93300000 gpos100 -band hs8 q22.1 q22.1 93300000 99000000 gneg -band hs8 q22.2 q22.2 99000000 101600000 gpos25 -band hs8 q22.3 q22.3 101600000 106200000 gneg -band hs8 q23.1 q23.1 106200000 110500000 gpos75 -band hs8 q23.2 q23.2 110500000 112100000 gneg -band hs8 q23.3 q23.3 112100000 117700000 gpos100 -band hs8 q24.11 q24.11 117700000 119200000 gneg -band hs8 q24.12 q24.12 119200000 122500000 gpos50 -band hs8 q24.13 q24.13 122500000 127300000 gneg -band hs8 q24.21 q24.21 127300000 131500000 gpos50 -band hs8 q24.22 q24.22 131500000 136400000 gneg -band hs8 q24.23 q24.23 136400000 139900000 gpos75 -band hs8 q24.3 q24.3 139900000 146364022 gneg -band hs9 p24.3 p24.3 0 2200000 gneg -band hs9 p24.2 p24.2 2200000 4600000 gpos25 -band hs9 p24.1 p24.1 4600000 9000000 gneg -band hs9 p23 p23 9000000 14200000 gpos75 -band hs9 p22.3 p22.3 14200000 16600000 gneg -band hs9 p22.2 p22.2 16600000 18500000 gpos25 -band hs9 p22.1 p22.1 18500000 19900000 gneg -band hs9 p21.3 p21.3 19900000 25600000 gpos100 -band hs9 p21.2 p21.2 25600000 28000000 gneg -band hs9 p21.1 p21.1 28000000 33200000 gpos100 -band hs9 p13.3 p13.3 33200000 36300000 gneg -band hs9 p13.2 p13.2 36300000 38400000 gpos25 -band hs9 p13.1 p13.1 38400000 41000000 gneg -band hs9 p12 p12 41000000 43600000 gpos50 -band hs9 p11.2 p11.2 43600000 47300000 gneg -band hs9 p11.1 p11.1 47300000 49000000 acen -band hs9 q11 q11 49000000 50700000 acen -band hs9 q12 q12 50700000 65900000 gvar -band hs9 q13 q13 65900000 68700000 gneg -band hs9 q21.11 q21.11 68700000 72200000 gpos25 -band hs9 q21.12 q21.12 72200000 74000000 gneg -band hs9 q21.13 q21.13 74000000 79200000 gpos50 -band hs9 q21.2 q21.2 79200000 81100000 gneg -band hs9 q21.31 q21.31 81100000 84100000 gpos50 -band hs9 q21.32 q21.32 84100000 86900000 gneg -band hs9 q21.33 q21.33 86900000 90400000 gpos50 -band hs9 q22.1 q22.1 90400000 91800000 gneg -band hs9 q22.2 q22.2 91800000 93900000 gpos25 -band hs9 q22.31 q22.31 93900000 96600000 gneg -band hs9 q22.32 q22.32 96600000 99300000 gpos25 -band hs9 q22.33 q22.33 99300000 102600000 gneg -band hs9 q31.1 q31.1 102600000 108200000 gpos100 -band hs9 q31.2 q31.2 108200000 111300000 gneg -band hs9 q31.3 q31.3 111300000 114900000 gpos25 -band hs9 q32 q32 114900000 117700000 gneg -band hs9 q33.1 q33.1 117700000 122500000 gpos75 -band hs9 q33.2 q33.2 122500000 125800000 gneg -band hs9 q33.3 q33.3 125800000 130300000 gpos25 -band hs9 q34.11 q34.11 130300000 133500000 gneg -band hs9 q34.12 q34.12 133500000 134000000 gpos25 -band hs9 q34.13 q34.13 134000000 135900000 gneg -band hs9 q34.2 q34.2 135900000 137400000 gpos25 -band hs9 q34.3 q34.3 137400000 141213431 gneg -band hsX p22.33 p22.33 0 4300000 gneg -band hsX p22.32 p22.32 4300000 6000000 gpos50 -band hsX p22.31 p22.31 6000000 9500000 gneg -band hsX p22.2 p22.2 9500000 17100000 gpos50 -band hsX p22.13 p22.13 17100000 19300000 gneg -band hsX p22.12 p22.12 19300000 21900000 gpos50 -band hsX p22.11 p22.11 21900000 24900000 gneg -band hsX p21.3 p21.3 24900000 29300000 gpos100 -band hsX p21.2 p21.2 29300000 31500000 gneg -band hsX p21.1 p21.1 31500000 37600000 gpos100 -band hsX p11.4 p11.4 37600000 42400000 gneg -band hsX p11.3 p11.3 42400000 46400000 gpos75 -band hsX p11.23 p11.23 46400000 49800000 gneg -band hsX p11.22 p11.22 49800000 54800000 gpos25 -band hsX p11.21 p11.21 54800000 58100000 gneg -band hsX p11.1 p11.1 58100000 60600000 acen -band hsX q11.1 q11.1 60600000 63000000 acen -band hsX q11.2 q11.2 63000000 64600000 gneg -band hsX q12 q12 64600000 67800000 gpos50 -band hsX q13.1 q13.1 67800000 71800000 gneg -band hsX q13.2 q13.2 71800000 73900000 gpos50 -band hsX q13.3 q13.3 73900000 76000000 gneg -band hsX q21.1 q21.1 76000000 84600000 gpos100 -band hsX q21.2 q21.2 84600000 86200000 gneg -band hsX q21.31 q21.31 86200000 91800000 gpos100 -band hsX q21.32 q21.32 91800000 93500000 gneg -band hsX q21.33 q21.33 93500000 98300000 gpos75 -band hsX q22.1 q22.1 98300000 102600000 gneg -band hsX q22.2 q22.2 102600000 103700000 gpos50 -band hsX q22.3 q22.3 103700000 108700000 gneg -band hsX q23 q23 108700000 116500000 gpos75 -band hsX q24 q24 116500000 120900000 gneg -band hsX q25 q25 120900000 128700000 gpos100 -band hsX q26.1 q26.1 128700000 130400000 gneg -band hsX q26.2 q26.2 130400000 133600000 gpos25 -band hsX q26.3 q26.3 133600000 138000000 gneg -band hsX q27.1 q27.1 138000000 140300000 gpos75 -band hsX q27.2 q27.2 140300000 142100000 gneg -band hsX q27.3 q27.3 142100000 147100000 gpos100 -band hsX q28 q28 147100000 155270560 gneg -band hsY p11.32 p11.32 0 2500000 gneg -band hsY p11.31 p11.31 2500000 3000000 gpos50 -band hsY p11.2 p11.2 3000000 11600000 gneg -band hsY p11.1 p11.1 11600000 12500000 acen -band hsY q11.1 q11.1 12500000 13400000 acen -band hsY q11.21 q11.21 13400000 15100000 gneg -band hsY q11.221 q11.221 15100000 19800000 gpos50 -band hsY q11.222 q11.222 19800000 22100000 gneg -band hsY q11.223 q11.223 22100000 26200000 gpos50 -band hsY q11.23 q11.23 26200000 28800000 gneg -band hsY q12 q12 28800000 59373566 gvar diff -r c284618dd8da -r 091714bd75a0 svdetect/BAM_preprocessingPairs.pl --- a/svdetect/BAM_preprocessingPairs.pl Tue Nov 06 10:06:26 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,340 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use warnings; -use Getopt::Std; -my $version = '0.4b_galaxy'; - -my $SAMTOOLS_BIN_DIR="/bioinfo/local/samtools"; - -my %opts = ( t=>1, p=>1, n=>1000000, f=>3, s=>0, S=>10000, o=>"." ); - -getopts('dt:p:n:f:s:S:o:b:l:x:N:', \%opts); #GALAXY - -my $working_dir=($opts{o} ne ".")? $opts{o}:"working directory"; - -my $pt_bad_mates_file=$opts{b}; #GALAXY -my $pt_log_file=$opts{l}; #GALAXY -my $pt_good_mates_file=$opts{x} if($opts{d}); #GALAXY - - -die(qq/ - -Description: - - Preprocessing of mates to get anomalously mapped mate-pair\/paired-end reads as input - for SVDetect. - - From all pairs mapped onto the reference genome, this script outputs abnormal pairs: - - mapped on two different chromosomes - - with an incorrect strand orientation and\/or pair order - - with an insert size distance +- sigma threshold - into a file sorted by read names - - -BAM\/SAM File input format only. - - Version : $version - SAMtools required for BAM files - - -Usage: BAM_preprocessingPairs.pl [options] - -Options: -t BOOLEAN read type: =1 (Illumina), =0 (SOLiD) [$opts{t}] - -p BOOLEAN pair type: =1 (paired-end), =0 (mate-pair) [$opts{p}] - -n INTEGER number of pairs for calculating mu and sigma lengths [$opts{n}] - -s INTEGER minimum value of ISIZE for calculating mu and sigma lengths [$opts{s}] - -S INTEGER maximum value of ISIZE for calculating mu and sigma lengths [$opts{S}] - -f REAL minimal number of sigma fold for filtering pairs [$opts{f}] - -d dump normal pairs into a file [] (optional) - -o STRING output directory [$working_dir] - -\n/) if (@ARGV == 0 && -t STDIN); - -unless (-d $opts{o}){ - mkdir $opts{o} or die; -} -$opts{o}.="/" if($opts{o}!~/\/$/); - -my $mates_file=shift(@ARGV); - -$mates_file=readlink($mates_file); - -my $bad_mates_file=(split(/\//,$mates_file))[$#_]; - -if($bad_mates_file=~/.(s|b)am$/){ - $bad_mates_file=~s/.(b|s)am$/.ab.sam/; - $bad_mates_file=$opts{o}.$bad_mates_file; -} - -else{ - die "Error: mate_file with the extension <.bam> or <.sam> needed !\n"; -} - -my $good_mates_file; -if($opts{d}){ - $good_mates_file=(split(/\//,$mates_file))[$#_]; - $good_mates_file=~s/.(b|s)am$/.norm.sam/; - $good_mates_file=$opts{o}.$good_mates_file; -} - -my $log_file=$opts{o}.$opts{N}.".svdetect_preprocessing.log"; #GALAXY - -#------------------------------------------------------------------------------# -#Calculate mu and sigma - -open LOG,">$log_file" or die "$0: can't open ".$log_file.":$!\n"; - -print LOG "\# Calculating mu and sigma lengths...\n"; -print LOG "-- file=$mates_file\n"; -print LOG "-- n=$opts{n}\n"; -print LOG "-- ISIZE min=$opts{s}, max=$opts{S}\n"; - -my ($record, $sumX,$sumX2) = (0,0,0); -my $warn=$opts{n}/10; -my $prev_pair="FIRST"; - -my $bam=($mates_file =~ /.bam$/)? 1:0; - -if($bam){ - open(MATES, "${SAMTOOLS_BIN_DIR}/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; -}else{ - open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n"; -} - -while(){ - - my @t=split; - - next if ($t[0]=~/^@/); - - my $current_pair=$t[0]; - next if($current_pair eq $prev_pair); - $prev_pair=$current_pair; - - my ($chr1,$chr2,$length)=($t[2],$t[6],abs($t[8])); - - next if ($chr1 eq "*" || $chr2 eq "*"); - next if ($length<$opts{s} || $length>$opts{S}) ; - - if($chr2 eq "="){ - - $sumX += $length; #add to sum and sum^2 for mean and variance calculation - $sumX2 += $length*$length; - $record++; - } - - if($record>$warn){ - print LOG "-- $warn pairs analysed\n"; - $warn+=$warn; - } - - last if ($record>$opts{n}); - -} -close (MATES); - -$record--; -my $mu = $sumX/$record; -my $sigma = sqrt($sumX2/$record - $mu*$mu); - -print LOG "-- Total : $record pairs analysed\n"; -print LOG "-- mu length = ".decimal($mu,1).", sigma length = ".decimal($sigma,1)."\n"; - -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#Preprocessing pairs - -$warn=100000; - -$record=0; -my %count=( ab=>0, norm=>0, chr=>0, sense=>0, dist=>0, unmap=>0); - -my $read_type=($opts{t})? "Illumina":"SOLiD"; -my $pair_type=($opts{p})? "paired-end":"mate-paired"; - -print LOG "\# Preprocessing pairs...\n"; -print LOG "-- file= $mates_file\n"; -print LOG "-- type= $read_type $pair_type reads\n"; -print LOG "-- sigma threshold= $opts{f}\n"; -print LOG "-- using ".decimal($mu-$opts{f}*$sigma,4)."-".decimal($mu+$opts{f}*$sigma,4)." as normal range of insert size\n"; - -my @header; - -if($bam){ - open(HEADER, "${SAMTOOLS_BIN_DIR}/samtools view -H $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; - @header=
; - close HEADER; - open(MATES, "${SAMTOOLS_BIN_DIR}/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; -}else{ - open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n"; -} - -open AB, ">$bad_mates_file" or die "$0: can't write in the output: $bad_mates_file :$!\n"; -print AB @header if($bam); - -if($opts{d}){ - open NORM, ">$good_mates_file" or die "$0: can't write in the output: $good_mates_file :$!\n"; - print NORM @header if($bam); -} - -$prev_pair="FIRST"; -my $prev_bad; - -while(){ - - my @t=split; - my $bad=0; - - if ($t[0]=~/^@/){ - print AB; - print NORM if ($opts{d}); - next; - } - - my $current_pair=$t[0]; - if($current_pair eq $prev_pair){ - next if($prev_bad==-1); - if($prev_bad){ - print AB; - }elsif(!$prev_bad){ - print NORM if($opts{d}); - } - next; - } - - $prev_pair=$current_pair; - - my ($chr1,$chr2,$pos1,$pos2,$length)=($t[2],$t[6],$t[3],$t[7], abs($t[8])); - - if ($chr1 eq "*" || $chr2 eq "*"){ - $prev_bad=-1; - $count{unmap}++; - $record++; - next; - - } - - my $strand1 = (($t[1]&0x0010))? 'R':'F'; - my $strand2 = (($t[1]&0x0020))? 'R':'F'; - my $order1 = (($t[1]&0x0040))? '1':'2'; - my $order2 = (($t[1]&0x0080))? '1':'2'; - - if($order1 == 2){ - ($strand1,$strand2)=($strand2,$strand1); - ($chr1,$chr2)=($chr2,$chr1); - ($pos1,$pos2)=($pos2,$pos1); - ($order1,$order2)=($order2,$order1); - } - - my $sense=$strand1.$strand2; - - if($chr1 ne "=" && $chr2 ne "="){ - $bad=1; - $count{chr}++; - } - - if($opts{p}){ #paired-end - if(!(($sense eq "FR" && $pos1<$pos2) || ($sense eq "RF" && $pos2<$pos1))){ - $bad=1; - $count{sense}++; - } - }else{ #mate-pair - if($opts{t}){ #Illumina - if(!(($sense eq "FR" && $pos2<$pos1) || ($sense eq "RF" && $pos1<$pos2))){ - $bad=1; - $count{sense}++; - } - }else{ #SOLiD - if(!(($sense eq "FF" && $pos2<$pos1) || ($sense eq "RR" && $pos1<$pos2))){ - $bad=1; - $count{sense}++; - } - } - } - - if(($chr1 eq "=" || $chr2 eq "=") && ($length <$mu - $opts{f}*$sigma || $length>$mu + $opts{f}*$sigma)){ - $bad=1; - $count{dist}++; - } - - if($bad){ - print AB; - $count{ab}++; - $prev_bad=$bad; - }else{ - print NORM if ($opts{d}); - $count{norm}++; - $prev_bad=$bad; - } - - $record++; - - if($record>$warn){ - print LOG "-- $warn pairs analysed\n"; - $warn+=100000; - } -} - -close AB; -close NORM if($opts{d}); - -print LOG "-- Total : $record pairs analysed\n"; -print LOG "-- $count{unmap} pairs whose one or both reads are unmapped\n"; -print LOG "-- ".($count{ab}+$count{norm})." mapped pairs\n"; -print LOG "---- $count{ab} abnormal mapped pairs\n"; -print LOG "------ $count{chr} pairs mapped on two different chromosomes\n"; -print LOG "------ $count{sense} pairs with incorrect strand orientation and\/or pair order\n"; -print LOG "------ $count{dist} pairs with incorrect insert size distance\n"; -print LOG "--- $count{norm} correct mapped pairs\n"; - -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#OUTPUT - -if($bam){ - - my $bam_file=$bad_mates_file; - $bam_file=~s/.sam$/.bam/; - print LOG "\# Converting sam to bam for abnormal mapped pairs\n"; - system("${SAMTOOLS_BIN_DIR}/samtools view -bS $bad_mates_file > $bam_file 2>".$opts{o}."samtools.log"); - unlink($bad_mates_file); - print LOG "-- output created: $bam_file\n"; - - system "rm $pt_bad_mates_file ; ln -s $bam_file $pt_bad_mates_file"; #GALAXY - - if($opts{d}){ - $bam_file=$good_mates_file; - $bam_file=~s/.sam$/.bam/; - print LOG "\# Converting sam to bam for correct mapped pairs\n"; - system("${SAMTOOLS_BIN_DIR}/samtools view -bS $good_mates_file > $bam_file 2>".$opts{o}."samtools.log"); - unlink($good_mates_file); - print LOG "-- output created: $bam_file\n"; - - system "rm $pt_good_mates_file ; ln -s $bam_file $pt_good_mates_file"; #GALAXY - - } - -} - -else{ - print LOG "-- output created: $bad_mates_file\n"; - print LOG "-- output created: $good_mates_file\n" if($opts{d}); -} - -close LOG; - -system "rm $pt_log_file ; ln -s $log_file $pt_log_file"; #GALAXY - - -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub decimal{ - - my $num=shift; - my $digs_to_cut=shift; - - $num=sprintf("%.".($digs_to_cut-1)."f", $num) if ($num=~/\d+\.(\d){$digs_to_cut,}/); - - return $num; -} -#------------------------------------------------------------------------------# diff -r c284618dd8da -r 091714bd75a0 svdetect/BAM_preprocessingPairs.xml --- a/svdetect/BAM_preprocessingPairs.xml Tue Nov 06 10:06:26 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,77 +0,0 @@ - - - to get abnormal pairs - - BAM_preprocessingPairs.pl -t '$readType' -p '$pairType' -n '$nbrePair' -s '$isizeMin' -S '$isizeMax' -f '$foldPair' -o $__new_file_path__/svdetect -b '$abBAM' -l '$log' -N $sample_name - #if $newBam.pairNormal=="yes" - -d -x '$normBAM' - #end if - '$inputBam' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - newBam['pairNormal'] == 'yes' - - - - - -**What it does** - -Bam_preprocessingPairs - Version 0.4b - -Preprocessing of mates to get anomalously mapped mate-pair/paired-end reads as input for SVDetect. - -From all pairs mapped onto the reference genome, this script outputs abnormal pairs: - - * mapped on two different chromosomes - * with an incorrect strand orientation and/or pair order - * with an insert size distance +- sigma threshold - -into a file prefix.ab.bam/sam sorted by read names - --BAM/SAM File input format only. - -SAMtools required for BAM files - ------ - -.. class:: infomark - -Contact Bruno Zeitouni (svdetect@curie.fr) for any questions or concerns about the Galaxy implementation of SVDetect. - - - - diff -r c284618dd8da -r 091714bd75a0 svdetect/SVDetect_compare.pl --- a/svdetect/SVDetect_compare.pl Tue Nov 06 10:06:26 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,716 +0,0 @@ -#!/usr/bin/perl -w - -=pod - -=head1 NAME - -SVDetect Compare for Galaxy - -Version: 0.8 for Galaxy - -=head1 SYNOPSIS - -SVDetect_compare.pl links2compare -conf [-help] [-man] - -=cut - -# ------------------------------------------------------------------- - -use strict; -use warnings; - -use Pod::Usage; -use Getopt::Long; - -use Config::General; -use Tie::IxHash; - -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# -#PARSE THE COMMAND LINE -my %OPT; -GetOptions(\%OPT, - 'conf=s', - 'out1=s', #GALAXY - 'out2=s', #GALAXY - 'out3=s', #GALAXY - 'out4=s', #GALAXY - 'out5=s', #GALAXY - 'out6=s', #GALAXY - 'out7=s', #GALAXY - 'out8=s', #GALAXY - 'out9=s', #GALAXY - 'l=s', #GALAXY - 'N=s', #GALAXY - 'help', - 'man' - ); - -pod2usage() if $OPT{help}; -pod2usage(-verbose=>2) if $OPT{man}; -pod2usage(-message=> "$!", -exitval => 2) if (!defined $OPT{conf}); - - -pod2usage() if(@ARGV<1); - -tie (my %func, 'Tie::IxHash',links2compare=>\&links2compare); - -foreach my $command (@ARGV){ - pod2usage(-message=> "Unknown command \"$command\"", -exitval => 2) if (!defined($func{$command})); -} -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# - -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# -#READ THE CONFIGURATION FILE -my $conf=Config::General->new( -ConfigFile => $OPT{conf}, - -Tie => "Tie::IxHash", - -AllowMultiOptions => 1, - -LowerCaseNames => 1, - -AutoTrue => 1); -my %CONF= $conf->getall; -validateconfiguration(\%CONF); #validation of the configuration parameters - - -my $SAMTOOLS_BIN_DIR="/bioinfo/local/samtools"; #GALAXY -my $BEDTOOLS_BIN_DIR="/bioinfo/local/BEDTools/bin"; #GALAXY - -my $pt_log_file=$OPT{l}; #GALAXY -my $log_file=$CONF{general}{output_dir}.$OPT{N}.".svdetect_compare.log"; #GALAXY -open LOG,">$log_file" or die "$0: can't open ".$log_file.":$!\n";#GALAXY - -my @pt_sv_file=($OPT{out1},$OPT{out2},$OPT{out3}) if($OPT{out1}); #GALAXY common,sample,reference -my @pt_circos_file=($OPT{out4},$OPT{out5},$OPT{out6}) if($OPT{out4}); #GALAXY common,sample,reference -my @pt_bed_file=($OPT{out7},$OPT{out8},$OPT{out9}) if($OPT{out7}); #GALAXY common,sample,reference - -$CONF{compare}{sample_link_file}=readlink($CONF{compare}{sample_link_file});#GALAXY -$CONF{compare}{sample_link_file}=~s/.sv.txt//; #GALAXY - -$CONF{compare}{reference_link_file}=readlink($CONF{compare}{reference_link_file});#GALAXY -$CONF{compare}{reference_link_file}=~s/.sv.txt//; #GALAXY - -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# - -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# -#COMMAND EXECUTION -foreach my $command (@ARGV){ - &{$func{$command}}(); -} -print LOG "-- end\n"; - -close LOG;#GALAXY -system "rm $pt_log_file ; ln -s $log_file $pt_log_file"; #GALAXY - -exit(0); -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# - -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# -#FUNCTIONS - -# -----------------------------------------------------------------------------# -#MAIN FUNCTION number 5:Comparison between samples, common or specific links -sub links2compare{ - - my @compare_files; - - compareSamples($CONF{general}{output_dir}, - $CONF{compare}{list_samples}, - $CONF{compare}{sample_link_file}, - $CONF{compare}{reference_link_file}, - $CONF{compare}{min_overlap}, - $CONF{compare}{same_sv_type}, - \@compare_files); - - my $pt_ind=0; - - for my $input_file (@compare_files){ - - $input_file=$CONF{general}{output_dir}.$input_file; - - my $output_file=$input_file; - $output_file=~s/unique$/compared/; - - sortLinks($input_file, $output_file,1); - - if($CONF{compare}{circos_output}){ - links2segdup($CONF{circos}{organism_id}, - $CONF{circos}{colorcode}, - $output_file, - $output_file.".segdup.txt"); - system "rm $pt_circos_file[$pt_ind]; ln -s $output_file.segdup.txt $pt_circos_file[$pt_ind]" if (defined $pt_circos_file[$pt_ind]); #GALAXY - } - - if($CONF{compare}{bed_output}){ - links2bedfile($CONF{compare}{read_lengths}, - $CONF{bed}{colorcode}, - $output_file, - $output_file.".bed"); - system "rm $pt_bed_file[$pt_ind]; ln -s $output_file.bed $pt_bed_file[$pt_ind]" if (defined $pt_bed_file[$pt_ind]); #GALAXY - } - - if($CONF{compare}{sv_output}){ - - links2SVfile ($output_file, $output_file.".sv.txt"); - system "rm $pt_sv_file[$pt_ind]; ln -s $output_file.sv.txt $pt_sv_file[$pt_ind]" if (defined $pt_sv_file[$pt_ind]); #GALAXY - } - $pt_ind++; - - } - unlink(@compare_files); - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub compareSamples{ - - my ($dir,$list_samples,$sample_file,$reference_file,$min_overlap,$same_sv_type,$file_names)=@_; - - my @bedpefiles; - my @list=split(",",$list_samples); - my @list_files=($sample_file,$reference_file); - - print LOG "\# Comparison procedure...\n"; - print LOG "-- samples=$list_samples\n". - "-- minimum overlap=$min_overlap\n". - "-- same SV type=$same_sv_type\n"; - - #conversion of links to bedPE format file - print LOG "-- Conversion of links.filtered files to bedPE format\n"; - for my $s (0..$#list) { - - links2bedPElinksfile($list[$s],$list_files[$s],$list_files[$s].".bedpe.txt"); - push(@bedpefiles,$list_files[$s].".bedpe.txt"); - - } - - #get common links between all samples compared - print LOG "-- Getting common links between all samples with BEDTools\n"; - my $common_name=join(".",@list); - - my $nb=scalar @list; - my $command=""; - my $prompt=">"; - - while ($nb>0){ - - for my $i (0..$#list_files){ - - $command.="$BEDTOOLS_BIN_DIR/pairToPair -type both -f $min_overlap -a ".$list_files[$i].".bedpe.txt"; - my $pipe=0; - - for my $j ($i+1..$#list_files){ - - $command.="| $BEDTOOLS_BIN_DIR/pairToPair -type both -f $min_overlap -a stdin" if($pipe); - $command.=" -b ".$list_files[$j].".bedpe.txt"; - $pipe=1; - - } - - $command.=$prompt.$dir.$common_name.".bedpe.tmp;"; - $prompt=">>"; - - my $first=shift(@list_files); push(@list_files,$first); - last; - } - $nb--; - } - - system ($command); - - push(@bedpefiles,$dir.$common_name.".bedpe.tmp"); - - #Post comparison to get common links if same type only (as an option) - open( FILE, "<".$dir.$common_name.".bedpe.tmp") or die "Can't open".$dir.$common_name.".bedpe.tmp : $!"; - open( OUT, ">".$dir.$common_name.".bedpe.unique") or die "Can't write in ".$dir.$common_name.".bedpe.unique : $!"; - - while(){ - my @t=split("\t",$_); - my $s=(split("_",$t[6]))[0]; - my ($sv1,$sv2)=($t[7],$t[18]); - splice(@t,11,$#t); - - if($same_sv_type){ - print OUT join("\t",@t)."\n" if($sv1 eq $sv2); - }else{ - print OUT join("\t",@t)."\n"; - } - } - close FILE; - close OUT; - - bedPElinks2linksfile($dir.$common_name.".bedpe.unique", $dir.$common_name.".unique"); - push(@bedpefiles,$dir.$common_name.".bedpe.unique"); - push(@$file_names,$common_name.".unique"); - print LOG "-- output created: ".$dir.$common_name.".compared\n"; - - #get specific links for each sample - print LOG "-- Getting specific links for each sample\n"; - for my $s (0..$#list) { - system("grep -Fxv -f ".$dir.$common_name.".bedpe.unique ".$list_files[$s].".bedpe.txt >".$dir.$list[$s].".bedpe.unique"); - bedPElinks2linksfile($dir.$list[$s].".bedpe.unique",$dir.$list[$s].".unique"); - push(@bedpefiles,$dir.$list[$s].".bedpe.unique"); - push(@$file_names,$list[$s].".unique"); - print LOG "-- output created: ".$dir.$list[$s].".compared\n"; - } - - unlink(@bedpefiles); - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#convert the links file to the circos format -sub links2segdup{ - - my($id,$color_code,$links_file,$segdup_file)=@_; - - print LOG "\# Converting to the circos format...\n"; - - tie (my %hcolor,'Tie::IxHash'); #color-code hash table - foreach my $col (keys %{$color_code}){ - my ($min_links,$max_links)=split(",",$color_code->{$col}); - $hcolor{$col}=[$min_links,$max_links]; - } - - open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n"; - open SEGDUP, ">$segdup_file" or die "$0: can't write in the output: $segdup_file :$!\n"; - - my $index=1; - while(){ - - my ($chr1,$start1,$end1,$chr2,$start2,$end2,$count)=(split)[0,1,2,3,4,5,6]; - - my $color=getColor($count,\%hcolor,"circos"); #get the color-code according the number of links - - print SEGDUP "$index\t$id$chr1\t$start1\t$end1\tcolor=$color\n". #circos output - "$index\t$id$chr2\t$start2\t$end2\tcolor=$color\n"; - $index++; - } - - close LINKS; - close SEGDUP; - print LOG "-- output created: $segdup_file\n"; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#convert the links file to the bedPE format for BEDTools usage -sub links2bedPElinksfile{ - - my ($sample,$links_file,$bedpe_file)=@_; - - open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n"; - open BEDPE, ">$bedpe_file" or die "$0: can't write in the output: $bedpe_file :$!\n"; - - my $nb_links=1; - - while(){ - - chomp; - my @t=split("\t",$_); - my ($chr1,$start1,$end1,$chr2,$start2,$end2)=splice(@t,0,6); - my $type=($chr1 eq $chr2)? "INTRA":"INTER"; - $type.="_".$t[10]; - - $start1--; $start2--; - - print BEDPE "$chr1\t$start1\t$end1\t$chr2\t$start2\t$end2". - "\t$sample"."_link$nb_links\t$type\t.\t.". - "\t".join("|",@t)."\n"; - - $nb_links++; - } - - close LINKS; - close BEDPE; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub bedPElinks2linksfile{ - - my ($bedpe_file,$links_file)=@_; - - open BEDPE, "<$bedpe_file" or die "$0: can't open: $bedpe_file :$!\n"; - open LINKS, ">$links_file" or die "$0: can't write in the output $links_file :$!\n"; - - while(){ - - chomp; - my $sample=(split("_",(split("\t",$_))[6]))[0]; - my @t1=(split("\t",$_))[0,1,2,3,4,5]; - my @t2=split(/\|/,(split("\t",$_))[10]); - push(@t2,$sample); - - print LINKS join("\t",@t1)."\t".join("\t",@t2)."\n"; - - } - close BEDPE; - close LINKS; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#convert the links file to the bed format -sub links2bedfile{ - - my ($tag_length,$color_code,$links_file,$bed_file)=@_; - - print LOG "\# Converting to the bed format...\n"; - - my $compare=1; - if($links_file!~/compared$/){ - $compare=0; - $tag_length->{none}->{1}=$tag_length->{1}; - $tag_length->{none}->{2}=$tag_length->{2}; - } - - #color-code hash table - tie (my %hcolor,'Tie::IxHash'); - my %color_order; - $color_order{"255,255,255"}=0; - my $n=1; - foreach my $col (keys %{$color_code}){ - my ($min_links,$max_links)=split(",",$color_code->{$col}); - $hcolor{$col}=[$min_links,$max_links]; - $color_order{$col}=$n; - $n++; - } - - my %pair; - my %pt; - $n=1; - open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n"; - - my %str=( "F"=>"+", "R"=>"-" ); - - while(){ - - my @t=split; - my $sample=($compare)? pop(@t):"none"; - - my $chr1=$t[0]; - my $chr2=$t[3]; - $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i); - $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i); - my $same_chr=($chr1 eq $chr2)? 1:0; - - my $count=$t[6]; - my $color=getColor($count,\%hcolor,"bed"); - - my @pairs=deleteBadOrderSensePairs(split(",",$t[7])); - my @strand1=deleteBadOrderSensePairs(split(",",$t[8])); - my @strand2=deleteBadOrderSensePairs(split(",",$t[9])); - my @ends_order1=deleteBadOrderSensePairs(split(",",$t[10])); - my @ends_order2=deleteBadOrderSensePairs(split(",",$t[11])); - my @position1=deleteBadOrderSensePairs(split(",",$t[14])); - my @position2=deleteBadOrderSensePairs(split(",",$t[15])); - my @start1; my @end1; getCoordswithLeftMost(\@start1,\@end1,\@position1,\@strand1,\@ends_order1,$tag_length->{$sample}); - my @start2; my @end2; getCoordswithLeftMost(\@start2,\@end2,\@position2,\@strand2,\@ends_order2,$tag_length->{$sample}); - - - for my $p (0..$#pairs){ - - if (!exists $pair{$pairs[$p]}){ - - if($same_chr){ - - $pair{$pairs[$p]}->{0}=[ $chr1, $start1[$p]-1, $end2[$p], $pairs[$p], 0, $str{$strand1[$p]}, - $start1[$p]-1, $end2[$p], $color, - 2, $tag_length->{$sample}->{$ends_order1[$p]}.",".$tag_length->{$sample}->{$ends_order2[$p]}, "0,".($start2[$p]-$start1[$p]) ]; - $pt{$n}=$pair{$pairs[$p]}->{0}; - $n++; - - }else{ - - $pair{$pairs[$p]}->{1}=[ $chr1, $start1[$p]-1, $end1[$p] , $pairs[$p]."/1", 0, $str{$strand1[$p]}, - $start1[$p]-1, $end1[$p], $color, - 1, $tag_length->{$sample}->{$ends_order1[$p]}, 0]; - $pt{$n}=$pair{$pairs[$p]}->{1}; - $n++; - - - $pair{$pairs[$p]}->{2}=[ $chr2, $start2[$p]-1, $end2[$p], $pairs[$p]."/2", 0, $str{$strand2[$p]}, - $start2[$p]-1, $end2[$p], $color, - 1, $tag_length->{$sample}->{$ends_order2[$p]}, 0]; - $pt{$n}=$pair{$pairs[$p]}->{2}; - $n++; - } - }else{ - - if($same_chr){ - ${$pair{$pairs[$p]}->{0}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{0}}[8]}); - }else{ - ${$pair{$pairs[$p]}->{1}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{1}}[8]}); - ${$pair{$pairs[$p]}->{2}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{2}}[8]}); - } - } - } - } - close LINKS; - - my $nb_pairs=$n-1; - - open BED, ">$bed_file" or die "$0: can't write in the output: $bed_file :$!\n"; - print BED "track name=\"$bed_file\" description=\"mate pairs involved in links\" ". - "visibility=2 itemRgb=\"On\"\n"; - - for my $i (1..$nb_pairs){ - print BED join("\t",@{$pt{$i}})."\n"; - } - - close BED; - - print LOG "-- output created: $bed_file\n"; - - undef %pair; - undef %pt; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub links2SVfile{ - - my($links_file,$sv_file)=@_; - - print LOG "\# Converting to the sv output table...\n"; - open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n"; - open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n"; - - my @header=qw(chr_type SV_type BAL_type chromosome1 start1-end1 average_dist - chromosome2 start2-end2 nb_pairs score_strand_filtering score_order_filtering score_insert_size_filtering - final_score breakpoint1_start1-end1 breakpoint2_start2-end2); - - my $nb_links=0; - - while (){ - - my @t=split; - my @sv=(); - my $sv_type="-"; - my $strand_ratio="-"; - my $eq_ratio="-"; - my $eq_type="-"; - my $insert_ratio="-"; - my $link="-"; - my ($bk1, $bk2)=("-","-"); - my $score="-"; - - my ($chr1,$start1,$end1)=($t[0],$t[1],$t[2]); - my ($chr2,$start2,$end2)=($t[3],$t[4],$t[5]); - my $nb_pairs=$t[6]; - $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i); - $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i); - my $chr_type=($chr1 eq $chr2)? "INTRA":"INTER"; - - #if strand filtering - if (defined $t[16]){ - #if inter-chr link - $sv_type=$t[16]; - if(defined $t[17] && $t[17]=~/^(\d+)\/(\d+)$/){ - $strand_ratio=floor($1/$2*100)."%"; - $score=$t[18]; - } - if(defined $t[18] && $t[18]=~/^(\d+)\/(\d+)$/){ - #if intra-chr link with insert size filtering - $strand_ratio=floor($1/$2*100)."%"; - $link=floor($t[17]); - if($sv_type!~/^INV/){ - $insert_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/); - $score=$t[20]; - }else{ - $score=$t[19]; - } - } - } - - if(defined $t[18] && ($t[18] eq "UNBAL" || $t[18] eq "BAL")){ - - #if strand and order filtering only and/or interchr link - $eq_type=$t[18]; - $eq_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/); - ($bk1, $bk2)=($t[20],$t[21]); - foreach my $bk ($bk1, $bk2){$bk=~s/\),\(/ /g; $bk=~s/(\(|\))//g; $bk=~s/,/-/g;} - $score=$t[22]; - - }elsif(defined $t[19] && ($t[19] eq "UNBAL" || $t[19] eq "BAL")){ - - #if all three filtering - $link=floor($t[17]); - $eq_type=$t[19]; - $eq_ratio=floor($1/$2*100)."%" if($t[20]=~/^(\d+)\/(\d+)$/); - - if(defined $t[21] && $t[21]=~/^(\d+)\/(\d+)$/){ - $insert_ratio=floor($1/$2*100)."%"; - ($bk1, $bk2)=($t[22],$t[23]); - $score=$t[24]; - - }else{ - ($bk1, $bk2)=($t[21],$t[22]); - $score=$t[23]; - } - foreach my $bk ($bk1, $bk2){$bk=~s/\),\(/ /g; $bk=~s/(\(|\))//g; $bk=~s/,/-/g;} - - } - - - push(@sv, $chr_type, $sv_type,$eq_type); - push(@sv,"$chr1\t$start1-$end1"); - push(@sv, $link); - push(@sv,"$chr2\t$start2-$end2", - $nb_pairs,$strand_ratio,$eq_ratio,$insert_ratio, decimal($score,4), $bk1, $bk2); - - - print SV join("\t",@sv)."\n"; - } - - close LINKS; - close SV; - - system "sort -k 9,9nr -k 13,13nr $sv_file > $sv_file.sorted"; - - open SV, "<".$sv_file.".sorted" or die "$0: can't open in the output: $sv_file".".sorted :$!\n"; - my @links=; - close SV; - - open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n"; - - print SV join("\t",@header)."\n"; - print SV @links; - close SV; - - unlink($sv_file.".sorted"); - - print LOG "-- output created: $sv_file\n"; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub deleteBadOrderSensePairs{ - - my (@tab)=@_; - my @tab2; - - foreach my $v (@tab){ - - $v=~s/[\(\)]//g; - push(@tab2,$v) if($v!~/[\$\*\@]$/); - } - return @tab2; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#gets starts and ends Coords when start=leftmost given positions, directions and orders -sub getCoordswithLeftMost { - - my ($starts,$ends,$positions,$strand,$end_order,$tag_length) = @_; - - for my $i (0..scalar(@{$positions})-1) { - - if($strand->[$i] eq 'F'){ - $starts->[$i]=$positions->[$i]; - $ends->[$i]=$positions->[$i]+$tag_length->{$end_order->[$i]}-1; - }else{ - $starts->[$i]=$positions->[$i]-$tag_length->{$end_order->[$i]}+1; - $ends->[$i]=$positions->[$i]; - } - } -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub floor { - my $nb = $_[0]; - $nb=~ s/\..*//; - return $nb; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub decimal{ - - my $num=shift; - my $digs_to_cut=shift; - - $num=sprintf("%.".($digs_to_cut-1)."f", $num) if ($num=~/\d+\.(\d){$digs_to_cut,}/); - - return $num; -} - -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#Sort links according the concerned chromosomes and their coordinates -sub sortLinks{ - - my ($links_file,$sortedlinks_file,$unique)=@_; - - print LOG "# Sorting links...\n"; - - my $pipe=($unique)? "| sort -u":""; - system "sort -k 1,1 -k 4,4 -k 2,2n -k 5,5n -k 8,8n $links_file $pipe > $sortedlinks_file"; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub getColor{ - - my($count,$hcolor,$format)=@_; - for my $col ( keys % { $hcolor} ) { - return $col if($count>=$hcolor->{$col}->[0] && $count<=$hcolor->{$col}->[1]); - } - return "white" if($format eq "circos"); - return "255,255,255" if($format eq "bed"); -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#check if the configuration file is correct -sub validateconfiguration{ - - my %conf=%{$_[0]}; - my $list_prgs="@ARGV"; - - my @circos_params=qw(organism_id colorcode); - my @bed_params=qw(colorcode); - my @compare_params=qw(list_samples list_read_lengths sample_link_file reference_link_file); - - unless (defined($conf{general}{output_dir})) { - $conf{general}{output_dir} = "."; - } - unless (-d $conf{general}{output_dir}){ - mkdir $conf{general}{output_dir} or die; - } - $conf{general}{output_dir}.="/" if($conf{general}{output_dir}!~/\/$/); - - - if($list_prgs=~/links2compare/){ - foreach my $p (@compare_params) { - die("Error Config : The compare parameter \"$p\" is not defined\n") if (!defined $conf{compare}{$p}); - } - - unless (defined($conf{compare}{same_sv_type})) { - $conf{compare}{same_sv_type} = 0; - } - - unless (defined($conf{compare}{min_overlap})) { - $conf{compare}{min_overlap} = 1E-9; - } - - if($conf{compare}{circos_output}){ - foreach my $p (@circos_params) { - next if($list_prgs=~/^ratio/ && $p eq "colorcode"); - die("Error Config : The circos parameter \"$p\" is not defined\n") if (!defined $conf{circos}{$p}); - } - } - if($conf{compare}{bed_output}){ - foreach my $p (@bed_params) { - die("Error Config : The bed parameter \"$p\" is not defined\n") if (!defined $conf{bed}{$p}); - } - die("Error Config : The compare parameter \"list_read_lengths\" is not defined\n") if (!defined $conf{compare}{list_read_lengths}); - - my @samples=split(",",$conf{compare}{list_samples}); - my @read_lengths=split(",",$conf{compare}{list_read_lengths}); - for my $i (0..$#samples){ - my @l=split("-",$read_lengths[$i]); - $conf{compare}{read_lengths}{$samples[$i]}={ 1=> $l[0], 2=> $l[1]}; - } - } - } - - -} -#------------------------------------------------------------------------------# -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# diff -r c284618dd8da -r 091714bd75a0 svdetect/SVDetect_compare.xml --- a/svdetect/SVDetect_compare.xml Tue Nov 06 10:06:26 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,218 +0,0 @@ - - -structural variants between two samples - -SVDetect_compare.pl links2compare -conf '$config_file' -l '$log_file' -N '$sample_name.$reference_name' - -#if $links2SV --out1 '$common_sv_file' --out2 '$sample_sv_file' --out3 '$reference_sv_file' -#end if - -#if $file_conversion.file_conversion_select=="convert" and $file_conversion.links2circos --out4 '$common_circos_file' --out5 '$sample_circos_file' --out6 '$reference_circos_file' -#end if - -#if $file_conversion.file_conversion_select=="convert" and $file_conversion.links2bed --out7 '$common_bed_file' --out8 '$sample_bed_file' --out9 '$reference_bed_file' -#end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - links2SV is True - - - links2SV is True - - - links2SV is True - - - - ( - file_conversion['file_conversion_select']=="convert" and - file_conversion['links2circos'] is True - ) - - - - ( - file_conversion['file_conversion_select']=="convert" and - file_conversion['links2circos'] is True - ) - - - - ( - file_conversion['file_conversion_select']=="convert" and - file_conversion['links2circos'] is True - ) - - - - - ( - file_conversion['file_conversion_select']=="convert" and - file_conversion['links2bed'] is True - ) - - - - ( - file_conversion['file_conversion_select']=="convert" and - file_conversion['links2bed'] is True - ) - - - - ( - file_conversion['file_conversion_select']=="convert" and - file_conversion['links2bed'] is True - ) - - - - - - - - - - -<general> -output_dir=$__new_file_path__/svdetect -</general> - -#if $file_conversion.file_conversion_select == "convert" -#if $file_conversion.links2circos -<circos> -organism_id=${file_conversion.organism_id} -<colorcode> -#for $color_repeat in $file_conversion.color_code -${color_repeat.color}=${color_repeat.interval} -#end for -</colorcode> -</circos> -#end if -#if $file_conversion.links2bed -<bed> -<colorcode> -#for $color_repeat in $file_conversion.color_code -#if str($color_repeat.color)== "grey" -190,190,190=${color_repeat.interval} -#end if -#if str($color_repeat.color)== "black" -0,0,0=${color_repeat.interval} -#end if -#if str($color_repeat.color)== "blue" -0,0,255=${color_repeat.interval} -#end if -#if str($color_repeat.color)== "green" -0,255,0=${color_repeat.interval} -#end if -#if str($color_repeat.color)== "purple" -153,50,205=${color_repeat.interval} -#end if -#if str($color_repeat.color)== "orange" -255,140,0=${color_repeat.interval} -#end if -#if str($color_repeat.color)== "red" -255,0,0=${color_repeat.interval} -#end if -#end for -</colorcode> -</bed> -#end if -#end if - -<compare> -list_samples=${sample_name},${reference_name} -list_read_lengths=${sample_read1_length}-${sample_read2_length},${reference_read1_length}-${reference_read2_length} -sample_link_file=${sample_mates_file} -reference_link_file=${reference_mates_file} -min_overlap=${min_overlap} -same_sv_type=${same_sv_type} -sv_output=${links2SV} -#if $file_conversion.file_conversion_select == "convert" -circos_output=${$file_conversion.links2circos} -bed_output=${$file_conversion.links2bed} -#end if -</compare> - - - - - -**What it does** - -SVDetect - Version : 0.8 - -Comparison of clusters between two samples to get common or sample-specific SVs - -This program is designed to compare filtered links between two anomalously mapped mate-pair/paired-end datasets -and to identify common and sample-specific SVs (like the usual sample/reference design). -Overlaps between coordinates of clusters and types of SVs are used as parameters of comparison. - -Manual documentation available at the http://svdetect.sourceforge.net/Site/Manual.html - ------ - -.. class:: infomark - -Contact Bruno Zeitouni (svdetect@curie.fr) for any questions or concerns about the Galaxy implementation of SVDetect. - - - diff -r c284618dd8da -r 091714bd75a0 svdetect/SVDetect_import.xml --- a/svdetect/SVDetect_import.xml Tue Nov 06 10:06:26 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,85 +0,0 @@ - - BAM, chromosome info or sv files - SVDetect_import.sh -i $file_path - #if str($type.file_type)=="bam" - -o $outbamfile - #elif str($type.file_type)=="len" - -o $outlenfile - #elif str($type.file_type)=="sv" - -o $outsvfile - #end if - - - - - - - - - - - - - - - - - - - - - - - - type['file_type']=="bam" - - - type['file_type']=="len" - - - type['file_type']=="sv" - - - -**What it does** - -This tool allows you to import quickly a BAM file, a chromosome info file or a SVDetect output file from you computer as inputs for SVDetect. - - -**Example of chromosome file** - -Input len file:: - - 1 chr1 247249719 - 2 chr2 242951149 - 3 chr3 199501827 - 4 chr4 191273063 - 5 chr5 180857866 - 6 chr6 170899992 - 7 chr7 158821424 - 8 chr8 146274826 - 9 chr9 140273252 - 10 chr10 135374737 - 11 chr11 134452384 - 12 chr12 132349534 - 13 chr13 114142980 - 14 chr14 106368585 - 15 chr15 100338915 - 16 chr16 88827254 - 17 chr17 78774742 - 18 chr18 76117153 - 19 chr19 63811651 - 20 chr20 62435964 - 21 chr21 46944323 - 22 chr22 49691432 - 23 chrX 154913754 - 24 chrY 57772954 - ------ - -.. class:: infomark - -Contact Bruno Zeitouni (svdetect@curie.fr) for any questions or concerns about the Galaxy implementation of SVDetect. - - - diff -r c284618dd8da -r 091714bd75a0 svdetect/SVDetect_run_parallel.pl --- a/svdetect/SVDetect_run_parallel.pl Tue Nov 06 10:06:26 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,3537 +0,0 @@ -#!/usr/bin/perl -w - -=pod - -=head1 NAME - -SVDetect - Program designed to the detection of structural variations -from paired-end/mate-pair sequencing data, compatible with SOLiD and Illumina (>=1.3) reads - -Version: 0.8 for Galaxy - -=head1 SYNOPSIS - -SVDetect -conf [-help] [-man] - - Command: - - linking detection and isolation of links - filtering filtering of links according different parameters - links2circos links conversion to circos format - links2bed paired-ends of links converted to bed format (UCSC) - links2SV formatted output to show most significant SVs - cnv calculate copy-number profiles - ratio2circos ratio conversion to circos density format - ratio2bedgraph ratio conversion to bedGraph density format (UCSC) - -=head1 DESCRIPTION - -This is a command-line interface to SVDetect. - - -=head1 AUTHORS - -Bruno Zeitouni Ebruno.zeitouni@curie.frE, -Valentina Boeva Evalentina.boeva@curie.frE - -=cut - -# ------------------------------------------------------------------- - -use strict; -use warnings; - -use Pod::Usage; -use Getopt::Long; - -use Config::General; -use Tie::IxHash; -use FileHandle; -use Parallel::ForkManager; - -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# -#PARSE THE COMMAND LINE -my %OPT; -GetOptions(\%OPT, - 'conf=s', - 'out1=s', #GALAXY - 'out2=s', #GALAXY - 'out3=s', #GALAXY - 'out4=s', #GALAXY - 'out5=s', #GALAXY - 'l=s', #GALAXY - 'N=s',#GALAXY - 'help',#GALAXY - 'man' - ); - -pod2usage() if $OPT{help}; -pod2usage(-verbose=>2) if $OPT{man}; -pod2usage(-message=> "$!", -exitval => 2) if (!defined $OPT{conf}); - -pod2usage() if(@ARGV<1); - -tie (my %func, 'Tie::IxHash',linking=>\&createlinks, - filtering=>\&filterlinks, - links2circos=>\&links2circos, - links2bed=>\&links2bed, - links2compare=>\&links2compare, - links2SV=>\&links2SV, - cnv=>\&cnv, - ratio2circos=>\&ratio2circos, - ratio2bedgraph=>\&ratio2bedgraph); - -foreach my $command (@ARGV){ - pod2usage(-message=> "Unknown command \"$command\"", -exitval => 2) if (!defined($func{$command})); -} -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# - - -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# -#READ THE CONFIGURATION FILE -my $conf=Config::General->new( -ConfigFile => $OPT{conf}, - -Tie => "Tie::IxHash", - -AllowMultiOptions => 1, - -LowerCaseNames => 1, - -AutoTrue => 1); -my %CONF= $conf->getall; -validateconfiguration(\%CONF); #validation of the configuration parameters - -my $SAMTOOLS_BIN_DIR="/bioinfo/local/samtools"; #GALAXY - -my $pt_log_file=$OPT{l}; #GALAXY -my $pt_links_file=$OPT{out1} if($OPT{out1}); #GALAXY -my $pt_flinks_file=$OPT{out2} if($OPT{out2}); #GALAXY -my $pt_sv_file=$OPT{out3} if($OPT{out3}); #GALAXY -my $pt_circos_file=$OPT{out4} if($OPT{out4}); #GALAXY -my $pt_bed_file=$OPT{out5} if($OPT{out5}); #GALAXY - -$CONF{general}{mates_file}=readlink($CONF{general}{mates_file});#GALAXY -$CONF{general}{cmap_file}=readlink($CONF{general}{cmap_file});#GALAXY - -my $log_file=$CONF{general}{output_dir}.$OPT{N}.".svdetect_run.log"; #GALAXY -open LOG,">$log_file" or die "$0: can't open ".$log_file.":$!\n";#GALAXY -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# - -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# -#COMMAND EXECUTION -foreach my $command (@ARGV){ - &{$func{$command}}(); -} -print LOG "-- end\n";#GALAXY - -close LOG;#GALAXY -system "rm $pt_log_file ; ln -s $log_file $pt_log_file"; #GALAXY -exit(0); -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# - - -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# -#FUNCTIONS -#------------------------------------------------------------------------------# -#MAIN FUNCTION number 1: Detection of links from mate-pairs data -sub createlinks{ - - my %CHR; #main hash table 1: fragments, links - my %CHRID; - my @MATEFILES; - - my $output_prefix=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}; - my @path=split(/\//,$output_prefix); - $output_prefix=$CONF{general}{output_dir}.$path[$#path]; - my $tmp_mates_prefix=$CONF{general}{tmp_dir}."mates/".$path[$#path]; - my $tmp_links_prefix=$CONF{general}{tmp_dir}."links/".$path[$#path]; - - shearingChromosome(\%CHR, \%CHRID, #making the genomic fragment library with the detection parameters - $CONF{detection}{window_size}, - $CONF{detection}{step_length}, - $CONF{general}{cmap_file}); - - if($CONF{detection}{split_mate_file}){ - - splitMateFile(\%CHR, \%CHRID, \@MATEFILES, $tmp_mates_prefix, - $CONF{general}{sv_type}, - $CONF{general}{mates_file}, - $CONF{general}{input_format}, - $CONF{general}{read_lengths} - ); - }else{ - - @MATEFILES=qx{ls $tmp_mates_prefix*} or die "# Error: No splitted mate files already created at $CONF{general}{tmp_dir} :$!"; - chomp(@MATEFILES); - print LOG "# Splitted mate files already created.\n"; - } - - - #Parallelization of the linking per chromosome for intra + interchrs - my $pm = new Parallel::ForkManager($CONF{general}{num_threads}); - - foreach my $matefile (@MATEFILES){ - - my $pid = $pm->start and next; - getlinks(\%CHR, \%CHRID, $matefile); - $pm->finish; - - } - $pm->wait_all_children; - - #Merge the chromosome links file into only one - my @LINKFILES= qx{ls $tmp_links_prefix*links} or die "# Error: No links files created at $CONF{general}{tmp_dir} :$!"; - chomp(@LINKFILES); - catFiles( \@LINKFILES => "$output_prefix.links" ); - - system "rm $pt_links_file; ln -s $output_prefix.links $pt_links_file" if (defined $pt_links_file); #GALAXY - print LOG "# Linking end procedure : output created: $output_prefix.links\n"; - #unlink(@LINKFILES); - #unlink(@MATEFILES); - - undef %CHR; - undef %CHRID; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub getlinks { - - my ($chr,$chrID,$tmp_mates_prefix)=@_; - - my $tmp_links_prefix=$tmp_mates_prefix; - $tmp_links_prefix=~s/\/mates\//\/links\//; - - my %PAIR; #main hash table 2: pairs - - linking($chr,$chrID, \%PAIR, #creation of all links from chromosome coordinates of pairs - $CONF{general}{read_lengths}, - $CONF{detection}{window_size}, - $CONF{detection}{step_length}, - $tmp_mates_prefix, - $CONF{general}{input_format}, - $CONF{general}{sv_type}, - "$tmp_links_prefix.links.mapped" - ); - - getUniqueLinks("$tmp_links_prefix.links.mapped", #remove the doublons - "$tmp_links_prefix.links.unique"); - - defineCoordsLinks($chr,$chrID, \%PAIR, #definition of the precise coordinates of links - $CONF{general}{input_format}, - $CONF{general}{sv_type}, - $CONF{general}{read_lengths}, - "$tmp_links_prefix.links.unique", - "$tmp_links_prefix.links.unique_defined"); - - sortLinks("$tmp_links_prefix.links.unique_defined", #sorting links from coordinates - "$tmp_links_prefix.links.sorted"); - - removeFullyOverlappedLinks("$tmp_links_prefix.links.sorted", #remove redundant links - "$tmp_links_prefix.links",1); #file output - - - undef %PAIR; - - unlink("$tmp_links_prefix.links.mapped", - "$tmp_links_prefix.links.unique", - "$tmp_links_prefix.links.unique_defined", - "$tmp_links_prefix.links.sorted"); -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub splitMateFile{ - - my ($chr,$chrID,$files_list,$output_prefix,$sv_type,$mates_file,$input_format,$tag_length)=@_; - - print LOG "# Splitting the mate file \"$mates_file\" for parallel processing...\n"; - - my %filesHandle; - - #fichier matefile inter - if($sv_type=~/^(all|inter)$/){ - my $newFileName="$output_prefix.interchrs"; - push(@{$files_list},$newFileName); - my $fh = new FileHandle; - $fh->open(">$newFileName"); - $filesHandle{inter}=$fh; - } - - #fichiers matefiles intra - if($sv_type=~/^(all|intra)$/){ - foreach my $k (1..$chr->{nb_chrs}){ - my $newFileName=$output_prefix.".".$chr->{$k}->{name}; - push(@{$files_list},$newFileName); - my $fh = new FileHandle; - $fh->open(">$newFileName"); - $filesHandle{$k}=$fh; - } - } - - if ($mates_file =~ /.gz$/) { - open(MATES, "gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; #gzcat - }elsif($mates_file =~ /.bam$/){ - open(MATES, "$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY - }else{ - open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n"; - } - - - while(){ - - my @t=split; - my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1,$end_order_read2); - - next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2, \$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length)); - - next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2}); - - ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2}); - - if( ($sv_type=~/^(all|inter)$/) && ($chr_read1 ne $chr_read2) ){ - my $fh2print=$filesHandle{inter}; - print $fh2print join("\t",@t)."\n"; - } - - if( ($sv_type=~/^(all|intra)$/) && ($chr_read1 eq $chr_read2) ){ - my $fh2print=$filesHandle{$chr_read1}; - print $fh2print join("\t",@t)."\n"; - - } - } - - foreach my $name (keys %filesHandle){ - my $fh=$filesHandle{$name}; - $fh->close; - } - - print LOG "# Splitted mate files of \"$mates_file\" created.\n"; -} - - -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub filterlinks{ - - my %CHR; - my %CHRID; - my @LINKFILES; - my @FLINKFILES; - - my $output_prefix=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}; - my @path=split(/\//,$output_prefix); - $output_prefix=$CONF{general}{output_dir}.$path[$#path]; - my $tmp_links_prefix=$CONF{general}{tmp_dir}."links/".$path[$#path]; - - createChrHashTables(\%CHR,\%CHRID, - $CONF{general}{cmap_file}); - - if($CONF{filtering}{split_link_file}){ - - splitLinkFile(\%CHR, \%CHRID, \@LINKFILES, - $tmp_links_prefix, - $CONF{general}{sv_type}, - "$output_prefix.links", - ); - }else{ - - @LINKFILES=qx{ls $tmp_links_prefix*links} or die "# Error: No splitted link files already created\n"; - chomp(@LINKFILES); - print LOG "# Splitted link files already created.\n"; - } - - #Parallelization of the filtering per chromosome for intra + interchrs - my $pm = new Parallel::ForkManager($CONF{general}{num_threads}); - - foreach my $linkfile (@LINKFILES){ - - my $pid = $pm->start and next; - getFilteredlinks(\%CHR, \%CHRID, $linkfile); - $pm->finish; - - } - $pm->wait_all_children; - - #Merge the chromosome links file into only one - @FLINKFILES= qx{ls $tmp_links_prefix*filtered} or die "# Error: No links files created\n"; - chomp(@FLINKFILES); - catFiles( \@FLINKFILES => "$output_prefix.links.filtered" ); - - system "rm $pt_flinks_file; ln -s $output_prefix.links.filtered $pt_flinks_file" if (defined $pt_flinks_file); #GALAXY - print LOG"# Filtering end procedure : output created: $output_prefix.links.filtered\n"; - - undef %CHR; - undef %CHRID; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub splitLinkFile{ - - my ($chr,$chrID,$files_list,$input_prefix,$sv_type,$link_file)=@_; - - print LOG "# Splitting the link file for parallel processing...\n"; - - my %filesHandle; - - #fichier matefile inter - if($sv_type=~/^(all|inter)$/){ - my $newFileName="$input_prefix.interchrs.links"; - push(@{$files_list},$newFileName); - my $fh = new FileHandle; - $fh->open(">$newFileName"); - $filesHandle{inter}=$fh; - } - - #fichiers matefiles intra - if($sv_type=~/^(all|intra)$/){ - foreach my $k (1..$chr->{nb_chrs}){ - my $newFileName=$input_prefix.".".$chr->{$k}->{name}.".links"; - push(@{$files_list},$newFileName); - my $fh = new FileHandle; - $fh->open(">$newFileName"); - $filesHandle{$k}=$fh; - } - } - - open LINKS, "<".$link_file or die "$0: can't open ".$link_file.":$!\n"; - while(){ - - my @t=split; - my ($chr_read1,$chr_read2)=($t[0],$t[3]); - - next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2}); - - ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2}); - - if( ($sv_type=~/^(all|inter)$/) && ($chr_read1 ne $chr_read2) ){ - my $fh2print=$filesHandle{inter}; - print $fh2print join("\t",@t)."\n"; - } - - if( ($sv_type=~/^(all|intra)$/) && ($chr_read1 eq $chr_read2) ){ - my $fh2print=$filesHandle{$chr_read1}; - print $fh2print join("\t",@t)."\n"; - - } - } - - foreach my $name (keys %filesHandle){ - my $fh=$filesHandle{$name}; - $fh->close; - } - - print LOG "# Splitted link files created.\n"; -} - - -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#MAIN FUNCTION number 2: Filtering processing -sub getFilteredlinks { - - my ($chr,$chrID,$tmp_links_prefix)=@_; - my %PAIR; - - strandFiltering($chr,$chrID, - $CONF{filtering}{nb_pairs_threshold}, #filtering of links - $CONF{filtering}{strand_filtering}, - $CONF{filtering}{chromosomes}, - $CONF{general}{input_format}, - $CONF{general}{cmap_file}, - $CONF{general}{mates_orientation}, - $CONF{general}{read_lengths}, - $tmp_links_prefix, - "$tmp_links_prefix.filtered", - ); - - if($CONF{filtering}{strand_filtering}){ #re-definition of links coordinates with strand filtering - - my @tmpfiles; - - rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_unique"); - - getUniqueLinks("$tmp_links_prefix.filtered_unique", - "$tmp_links_prefix.filtered"); - - push(@tmpfiles,"$tmp_links_prefix.filtered_unique"); - - if($CONF{filtering}{order_filtering}){ #filtering using the order - - rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_ordered"); - - orderFiltering($chr,$chrID, - $CONF{filtering}{nb_pairs_threshold}, - $CONF{filtering}{nb_pairs_order_threshold}, - $CONF{filtering}{mu_length}, - $CONF{filtering}{sigma_length}, - $CONF{general}{mates_orientation}, - $CONF{general}{read_lengths}, - "$tmp_links_prefix.filtered_ordered", - "$tmp_links_prefix.filtered", - ); - - push(@tmpfiles,"$tmp_links_prefix.filtered_ordered"); - } - - if (($CONF{filtering}{insert_size_filtering})&& - ($CONF{general}{sv_type} ne 'inter')){ - - rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_withoutIndelSize"); - - addInsertionInfo($chr,$chrID, - $CONF{filtering}{nb_pairs_threshold}, - $CONF{filtering}{order_filtering}, - $CONF{filtering}{indel_sigma_threshold}, - $CONF{filtering}{dup_sigma_threshold}, - $CONF{filtering}{singleton_sigma_threshold}, - $CONF{filtering}{mu_length}, - $CONF{filtering}{sigma_length}, - $CONF{general}{mates_orientation}, - $CONF{general}{read_lengths}, - "$tmp_links_prefix.filtered_withoutIndelSize", - "$tmp_links_prefix.filtered" - ); - - push(@tmpfiles,"$tmp_links_prefix.filtered_withoutIndelSize"); - } - - sortLinks("$tmp_links_prefix.filtered", - "$tmp_links_prefix.filtered_sorted"); - - removeFullyOverlappedLinks("$tmp_links_prefix.filtered_sorted", - "$tmp_links_prefix.filtered_nodup", - ); - - postFiltering("$tmp_links_prefix.filtered_nodup", - "$tmp_links_prefix.filtered", - $CONF{filtering}{final_score_threshold}); - - push(@tmpfiles,"$tmp_links_prefix.filtered_sorted","$tmp_links_prefix.filtered_nodup"); - - unlink(@tmpfiles); - - - } - undef %PAIR; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#MAIN FUNCTION number 3: Circos format conversion for links -sub links2circos{ - - my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered"; - my @path=split(/\//,$input_file); - $input_file=$CONF{general}{output_dir}.$path[$#path]; - - my $output_file.=$input_file.".segdup.txt"; - - links2segdup($CONF{circos}{organism_id}, - $CONF{circos}{colorcode}, - $input_file, - $output_file); #circos file output - - system "rm $pt_circos_file; ln -s $output_file $pt_circos_file" if (defined $pt_circos_file); #GALAXY -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#MAIN FUNCTION number 4: Bed format conversion for links -sub links2bed{ - - my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered"; - my @path=split(/\//,$input_file); - $input_file=$CONF{general}{output_dir}.$path[$#path]; - - my $output_file.=$input_file.".bed.txt"; - - links2bedfile($CONF{general}{read_lengths}, - $CONF{bed}{colorcode}, - $input_file, - $output_file); #bed file output - - system "rm $pt_bed_file; ln -s $output_file $pt_bed_file" if (defined $pt_bed_file); #GALAXY - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#MAIN FUNCTION number 6: Bed format conversion for links -sub links2SV{ - - my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered"; - - my @path=split(/\//,$input_file); - $input_file=$CONF{general}{output_dir}.$path[$#path]; - - my $output_file.=$input_file.".sv.txt"; - - - links2SVfile( $input_file, - $output_file); - - system "rm $pt_sv_file; ln -s $output_file $pt_sv_file" if (defined $pt_sv_file); #GALAXY -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#MAIN FUNCTION number 7: copy number variations, coverage ratio calculation -sub cnv{ - - my %CHR; - my %CHRID; - my @MATEFILES; - my @MATEFILES_REF; - - my $output_prefix=$CONF{general}{mates_file}; - my $output_prefix_ref=$CONF{detection}{mates_file_ref}; - my @path=split(/\//,$output_prefix); - my @path_ref=split(/\//,$output_prefix_ref); - $output_prefix=$CONF{general}{output_dir}.$path[$#path]; - $output_prefix_ref=$CONF{general}{output_dir}.$path_ref[$#path_ref]; - my $tmp_mates_prefix=$CONF{general}{tmp_dir}."mates/".$path[$#path]; - my $tmp_mates_prefix_ref=$CONF{general}{tmp_dir}."mates/".$path_ref[$#path_ref]; - my $tmp_density_prefix=$CONF{general}{tmp_dir}."density/".$path[$#path]; - - shearingChromosome(\%CHR, \%CHRID, #making the genomic fragment library with the detection parameters - $CONF{detection}{window_size}, - $CONF{detection}{step_length}, - $CONF{general}{cmap_file}); - - if($CONF{detection}{split_mate_file}){ - - splitMateFile(\%CHR, \%CHRID, \@MATEFILES, $tmp_mates_prefix, - "intra", - $CONF{general}{mates_file}, - $CONF{general}{input_format}, - $CONF{general}{read_lengths} - ); - - splitMateFile(\%CHR, \%CHRID, \@MATEFILES_REF, $tmp_mates_prefix_ref, - "intra", - $CONF{detection}{mates_file_ref}, - $CONF{general}{input_format}, - $CONF{general}{read_lengths} - ); - - - }else{ - - @MATEFILES=qx{ls $tmp_mates_prefix*} or die "# Error: No splitted sample mate files of \"$CONF{general}{mates_file}\" already created at $CONF{general}{tmp_dir} :$!"; - chomp(@MATEFILES); - @MATEFILES_REF=qx{ls $tmp_mates_prefix_ref*} or die "# Error: No splitted reference mate files of \"$CONF{detection}{mates_file_ref}\" already created at $CONF{general}{tmp_dir} :$!"; - chomp(@MATEFILES_REF); - print LOG "# Splitted sample and reference mate files already created.\n"; - } - - #Parallelization of the cnv per chromosome - my $pm = new Parallel::ForkManager($CONF{general}{num_threads}); - - foreach my $file (0..$#MATEFILES){ - - my $pid = $pm->start and next; - - densityCalculation(\%CHR, \%CHRID, $file, - $CONF{general}{read_lengths}, - $CONF{detection}{window_size}, - $CONF{detection}{step_length}, - \@MATEFILES, - \@MATEFILES_REF, - $MATEFILES[$file].".density", - $CONF{general}{input_format}); - - $pm->finish; - - } - $pm->wait_all_children; - - #Merge the chromosome links file into only one - my @DENSITYFILES= qx{ls $tmp_density_prefix*density} or die "# Error: No density files created at $CONF{general}{tmp_dir} :$!"; - chomp(@DENSITYFILES); - catFiles( \@DENSITYFILES => "$output_prefix.density" ); - - print LOG "# cnv end procedure : output created: $output_prefix.density\n"; - - - undef %CHR; - undef %CHRID; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#MAIN FUNCTION number 8: Circos format conversion for cnv ratios -sub ratio2circos{ - - my $input_file=$CONF{general}{mates_file}.".density"; - - my @path=split(/\//,$input_file); - $input_file=$CONF{general}{output_dir}.$path[$#path]; - - my $output_file.=$input_file.".segdup.txt"; - - ratio2segdup($CONF{circos}{organism_id}, - $input_file, - $output_file); -} -#------------------------------------------------------------------------------# -#MAIN FUNCTION number 9: BedGraph format conversion for cnv ratios -sub ratio2bedgraph{ - - my $input_file=$CONF{general}{mates_file}.".density"; - - my @path=split(/\//,$input_file); - $input_file=$CONF{general}{output_dir}.$path[$#path]; - - my $output_file.=$input_file.".bedgraph.txt"; - - ratio2bedfile($input_file, - $output_file); #bed file output -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#Creation of the fragment library -sub shearingChromosome{ - - print LOG "# Making the fragments library...\n"; - - my ($chr,$chrID,$window,$step,$cmap_file)=@_; #window and step sizes parameters - - createChrHashTables($chr,$chrID,$cmap_file); #hash tables: chromosome ID <=> chromsomes Name - - foreach my $k (1..$chr->{nb_chrs}){ - - print LOG"-- $chr->{$k}->{name}\n"; - - my $frag=1; - for (my $start=0; $start<$chr->{$k}->{length}; $start+=$step){ - - my $end=($start<($chr->{$k}->{length})-$window)? $start+$window-1:($chr->{$k}->{length})-1; - $chr->{$k}->{$frag}=[$start,$end]; #creation of fragments, coordinates storage - - if($end==($chr->{$k}->{length})-1){ - $chr->{$k}->{nb_frag}=$frag; #nb of fragments per chromosome - last; - } - $frag++; - } - } -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#Creation of chromosome hash tables from the cmap file -sub createChrHashTables{ - - my ($chr,$chrID,$cmap_file)=@_; - $chr->{nb_chrs}=0; - - open CMAP, "<".$cmap_file or die "$0: can't open ".$cmap_file.":$!\n"; - while(){ - - if(/^\s+$/){ next;} - my ($k,$name,$length) = split; - $chr->{$k}->{name}=$name; - $chr->{$k}->{length}=$length; - $chrID->{$name}=$k; - $chr->{nb_chrs}++; - - } - close CMAP; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#Read the mate file according the input format file (solid, eland or sam) -sub readMateFile{ - - my ($chr1,$chr2,$pos1,$pos2,$order1,$order2,$t,$file_type,$tag_length)=@_; - my ($strand1,$strand2); - - if($file_type eq "solid"){ - - ($$chr1,$$chr2,$$pos1,$$pos2,$$order1,$$order2)=($$t[6],$$t[7],$$t[8]+1,$$t[9]+1,1,2); #0-based - - }else{ - my ($tag_length1,$tag_length2); - ($$chr1,$$chr2,$$pos1,$strand1,$$pos2,$strand2,$$order1,$$order2,$tag_length1,$tag_length2)=($$t[11],$$t[12],$$t[7],$$t[8],$$t[9],$$t[10],1,2,length($$t[1]),length($$t[2])) #1-based - if($file_type eq "eland"); - - if($file_type eq "sam"){ - - return 0 if ($$t[0]=~/^@/); #header sam filtered out - - ($$chr1,$$chr2,$$pos1,$$pos2)=($$t[2],$$t[6],$$t[3],$$t[7]); - - return 0 if ($$chr1 eq "*" || $$chr2 eq "*"); - - $$chr2=$$chr1 if($$chr2 eq "="); - - $strand1 = (($$t[1]&0x0010))? 'R':'F'; - $strand2 = (($$t[1]&0x0020))? 'R':'F'; - - $$order1= (($$t[1]&0x0040))? '1':'2'; - $$order2= (($$t[1]&0x0080))? '1':'2'; - $tag_length1 = $tag_length->{$$order1}; - $tag_length2 = $tag_length->{$$order2}; - } - - $$pos1 = -($$pos1+$tag_length1) if ($strand1 eq "R"); #get sequencing starts - $$pos2 = -($$pos2+$tag_length2) if ($strand2 eq "R"); - } - return 1; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#Parsing of the mates files and creation of links between 2 chromosomal fragments -sub linking{ - - my ($chr,$chrID,$pair,$tag_length,$window_dist,$step,$mates_file,$input_format,$sv_type,$links_file)=@_; - my %link; - - my $record=0; - my $nb_links=0; - my $warn=10000; - - my @sfile=split(/\./,$mates_file); - my $fchr=$sfile[$#sfile]; - - my $fh = new FileHandle; - - print LOG "# $fchr : Linking procedure...\n"; - print LOG "-- file=$mates_file\n". - "-- chromosome=$fchr\n". - "-- input format=$input_format\n". - "-- type=$sv_type\n". - "-- read1 length=$tag_length->{1}, read2 length=$tag_length->{2}\n". - "-- window size=$window_dist, step length=$step\n"; - - if ($mates_file =~ /.gz$/) { - $fh->open("gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; #gzcat - }elsif($mates_file =~ /.bam$/){ - $fh->open("$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY - }else{ - $fh->open("<".$mates_file) or die "$0: can't open ".$mates_file.":$!\n"; - } - - - while(<$fh>){ - - my @t=split; #for each mate-pair - my $mate=$t[0]; - my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1,$end_order_read2); - - next if(exists $$pair{$mate}); - - next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2, \$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length)); - - next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2}); - - ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2}); - - if($sv_type ne "all"){ - if( ($sv_type eq "inter") && ($chr_read1 ne $chr_read2) || - ($sv_type eq "intra") && ($chr_read1 eq $chr_read2) ){ - }else{ - next; - } - } - - $$pair{$mate}=[$chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1, $end_order_read2 ]; #fill out the hash pair table (ready for the defineCoordsLinks function) - - $record++; - - my ($coord_start_read1,$coord_end_read1,$coord_start_read2,$coord_end_read2); #get the coordinates of each read - - recupCoords($firstbase_read1,\$coord_start_read1,\$coord_end_read1,$tag_length->{$end_order_read1},$input_format); - recupCoords($firstbase_read2,\$coord_start_read2,\$coord_end_read2,$tag_length->{$end_order_read2},$input_format); - - for(my $i=1;$i<=$chr->{$chr_read1}->{'nb_frag'};$i++){ #fast genome parsing for link creation - - if (abs ($coord_start_read1-${$chr->{$chr_read1}->{$i}}[0]) <= $window_dist){ - - if(overlap($coord_start_read1,$coord_end_read1,${$chr->{$chr_read1}->{$i}}[0],${$chr->{$chr_read1}->{$i}}[1])){ - - for(my $j=1;$j<=$chr->{$chr_read2}->{'nb_frag'};$j++){ - - if (abs ($coord_start_read2-${$chr->{$chr_read2}->{$j}}[0]) <= $window_dist) { - - if(overlap($coord_start_read2,$coord_end_read2,${$chr->{$chr_read2}->{$j}}[0],${$chr->{$chr_read2}->{$j}}[1])){ - - makeLink(\%link,$chr_read1,$i,$chr_read2,$j,$mate,\$nb_links); #make the link - } - - }else{ - - $j=getNextFrag($coord_start_read2,$j,${$chr->{$chr_read2}->{$j}}[0],$chr->{$chr_read2}->{nb_frag},$window_dist,$step); - } - } - } - - }else{ - - $i=getNextFrag($coord_start_read1,$i,${$chr->{$chr_read1}->{$i}}[0],$chr->{$chr_read1}->{nb_frag},$window_dist,$step); - } - } - - if($record>=$warn){ - print LOG "-- $fchr : $warn mate-pairs analysed - $nb_links links done\n"; - $warn+=10000; - } - } - $fh->close; - - if(!$nb_links){ - print LOG "-- $fchr : No mate-pairs !\n". - "-- $fchr : No links have been found with the selected type of structural variations \($sv_type\)\n"; - } - - print LOG "-- $fchr : Total : $record mate-pairs analysed - $nb_links links done\n"; - - print LOG "-- $fchr : writing...\n"; - - $fh = new FileHandle; - - $fh->open(">".$links_file) or die "$0: can't write in the output ".$links_file." :$!\n"; - - foreach my $chr1 ( sort { $a <=> $b} keys %link){ #Sorted links output - - foreach my $chr2 ( sort { $a <=> $b} keys %{$link{$chr1}}){ - - foreach my $frag1 ( sort { $a <=> $b} keys %{$link{$chr1}{$chr2}}){ - - foreach my $frag2 ( sort { $a <=> $b} keys %{$link{$chr1}{$chr2}{$frag1}}){ - - my @count=split(",",$link{$chr1}{$chr2}{$frag1}{$frag2}); - print $fh "$chr->{$chr1}->{name}\t".(${$chr->{$chr1}->{$frag1}}[0]+1)."\t".(${$chr->{$chr1}->{$frag1}}[1]+1)."\t". - "$chr->{$chr2}->{name}\t".(${$chr->{$chr2}->{$frag2}}[0]+1)."\t".(${$chr->{$chr2}->{$frag2}}[1]+1)."\t". - scalar @count."\t". #nb of read - $link{$chr1}{$chr2}{$frag1}{$frag2}."\n"; #mate list - } - } - } - } - - $fh->close; - - undef %link; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#remove exact links doublons according to the mate list -sub getUniqueLinks{ - - my ($links_file,$nrlinks_file)=@_; - my %links; - my %pt; - my $nb_links; - my $n=1; - - my $record=0; - my $warn=300000; - - my @sfile=split(/\./,$links_file); - my $fchr=$sfile[$#sfile-2]; - - my $fh = new FileHandle; - - print LOG "# $fchr : Getting unique links...\n"; - $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; - - while(<$fh>){ - - my @t=split; - my $mates=$t[7]; - $record++; - - if(!exists $links{$mates}){ #Unique links selection - - $links{$mates}=[@t]; - $pt{$n}=$links{$mates}; - $n++; - - - }else{ #get the link coordinates from the mate-pairs list - - for my $i (1,2,4,5){ #get the shortest regions - - $links{$mates}->[$i]=($t[$i]>$links{$mates}->[$i])? $t[$i]:$links{$mates}->[$i] #maximum start - if($i==1 || $i==4); - $links{$mates}->[$i]=($t[$i]<$links{$mates}->[$i])? $t[$i]:$links{$mates}->[$i] #minimum end - if($i==2 || $i==5); - } - } - if($record>=$warn){ - print LOG "-- $fchr : $warn links analysed - ".($n-1)." unique links done\n"; - $warn+=300000; - } - } - $fh->close; - - $nb_links=$n-1; - print LOG "-- $fchr : Total : $record links analysed - $nb_links unique links done\n"; - - $fh = new FileHandle; - $fh->open(">$nrlinks_file") or die "$0: can't write in the output: $nrlinks_file :$!\n"; - print LOG "-- $fchr : writing...\n"; - for my $i (1..$nb_links){ - - print $fh join("\t",@{$pt{$i}})."\n"; #all links output - } - - $fh->close; - - undef %links; - undef %pt; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#get the new coordinates of each link from the mate list -sub defineCoordsLinks{ - - my ($chr,$chrID,$pair,$input_format,$sv_type,$tag_length,$links_file,$clinks_file)=@_; - - my @sfile=split(/\./,$links_file); - my $fchr=$sfile[$#sfile-2]; - - my $fh = new FileHandle; - my $fh2 = new FileHandle; - - $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; - $fh2->open(">$clinks_file") or die "$0: can't write in the output: $clinks_file :$!\n"; - - print LOG "# $fchr : Defining precise link coordinates...\n"; - - my $record=0; - my $warn=100000; - - my %coords; - my %strands; - my %order; - my %ends_order; - - while(<$fh>){ - - - my ($col1,$col2)=(1,2); #for an intrachromosomal link - my $diffchr=0; #difference between chr1 and chr2 - my ($chr1,$chr2,$mates_list,$npairs)=(split)[0,3,7,8]; - ($chr1,$chr2) = ($chrID->{$chr1},$chrID->{$chr2}); - if ($chr1 != $chr2){ #for an interchromosomal link - $col1=$col2=0; #no distinction - $diffchr=1; - } - - my @pairs=split(",",$mates_list); - - $coords{$col1}{$chr1}->{start}=undef; - $coords{$col1}{$chr1}->{end}=undef; - $coords{$col2}{$chr2}->{start}=undef; - $coords{$col2}{$chr2}->{end}=undef; - $strands{$col1}{$chr1}=undef; - $strands{$col2}{$chr2}=undef; - $ends_order{$col1}{$chr1}=undef; - $ends_order{$col2}{$chr2}=undef; - - - $order{$col1}{$chr1}->{index}->{1}=undef; - $order{$col1}{$chr1}->{index}->{2}=undef; - $order{$col2}{$chr2}->{index}->{1}=undef; - $order{$col2}{$chr2}->{index}->{2}=undef; - $order{$col1}{$chr1}->{order}=undef; - $order{$col2}{$chr2}->{order}=undef; - - $record++; - - for my $p (0..$#pairs){ #for each pair - - my ($coord_start_read1,$coord_end_read1); - my ($coord_start_read2,$coord_end_read2); - my $strand_read1=recupCoords(${$$pair{$pairs[$p]}}[2],\$coord_start_read1,\$coord_end_read1,$tag_length->{${$$pair{$pairs[$p]}}[4]},$input_format); - my $strand_read2=recupCoords(${$$pair{$pairs[$p]}}[3],\$coord_start_read2,\$coord_end_read2,$tag_length->{${$$pair{$pairs[$p]}}[5]},$input_format); - - if(!$diffchr){ #for a intrachromosomal link - if($coord_start_read2<$coord_start_read1){ #get the closer start coordinate for each column - ($col1,$col2)=(2,1); - }else{ - ($col1,$col2)=(1,2); - } - } - - push(@{$coords{$col1}{${$$pair{$pairs[$p]}}[0]}->{start}},$coord_start_read1); #get coords and strands of f3 and r3 reads - push(@{$coords{$col1}{${$$pair{$pairs[$p]}}[0]}->{end}},$coord_end_read1); - push(@{$coords{$col2}{${$$pair{$pairs[$p]}}[1]}->{start}},$coord_start_read2); - push(@{$coords{$col2}{${$$pair{$pairs[$p]}}[1]}->{end}},$coord_end_read2); - push(@{$strands{$col1}{${$$pair{$pairs[$p]}}[0]}},$strand_read1); - push(@{$strands{$col2}{${$$pair{$pairs[$p]}}[1]}},$strand_read2); - push(@{$ends_order{$col1}{${$$pair{$pairs[$p]}}[0]}},${$$pair{$pairs[$p]}}[4]); - push(@{$ends_order{$col2}{${$$pair{$pairs[$p]}}[1]}},${$$pair{$pairs[$p]}}[5]); - } - - ($col1,$col2)=(1,2) if(!$diffchr); - - my $coord_start_chr1=min(min(@{$coords{$col1}{$chr1}->{start}}),min(@{$coords{$col1}{$chr1}->{end}})); #get the biggest region - my $coord_end_chr1=max(max(@{$coords{$col1}{$chr1}->{start}}),max(@{$coords{$col1}{$chr1}->{end}})); - my $coord_start_chr2=min(min(@{$coords{$col2}{$chr2}->{start}}),min(@{$coords{$col2}{$chr2}->{end}})); - my $coord_end_chr2=max(max(@{$coords{$col2}{$chr2}->{start}}),max(@{$coords{$col2}{$chr2}->{end}})); - - @{$order{$col1}{$chr1}->{index}->{1}}= sort {${$coords{$col1}{$chr1}->{start}}[$a] <=> ${$coords{$col1}{$chr1}->{start}}[$b]} 0 .. $#{$coords{$col1}{$chr1}->{start}}; - @{$order{$col2}{$chr2}->{index}->{1}}= sort {${$coords{$col2}{$chr2}->{start}}[$a] <=> ${$coords{$col2}{$chr2}->{start}}[$b]} 0 .. $#{$coords{$col2}{$chr2}->{start}}; - - foreach my $i (@{$order{$col1}{$chr1}->{index}->{1}}){ #get the rank of the chr2 reads according to the sorted chr1 reads (start coordinate sorting) - foreach my $j (@{$order{$col2}{$chr2}->{index}->{1}}){ - - if(${$order{$col1}{$chr1}->{index}->{1}}[$i] == ${$order{$col2}{$chr2}->{index}->{1}}[$j]){ - ${$order{$col1}{$chr1}->{index}->{2}}[$i]=$i; - ${$order{$col2}{$chr2}->{index}->{2}}[$i]=$j; - last; - } - } - } - - foreach my $i (@{$order{$col1}{$chr1}->{index}->{2}}){ #use rank chr1 as an ID - foreach my $j (@{$order{$col2}{$chr2}->{index}->{2}}){ - - if(${$order{$col1}{$chr1}->{index}->{2}}[$i] == ${$order{$col2}{$chr2}->{index}->{2}}[$j]){ - ${$order{$col1}{$chr1}->{order}}[$i]=$i+1; - ${$order{$col2}{$chr2}->{order}}[$i]=$j+1; - last; - } - } - } - - @pairs=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},\@pairs);#sorting of the pairs, strands, and start coords from the sorted chr2 reads - @{$strands{$col1}{$chr1}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$strands{$col1}{$chr1}); - @{$strands{$col2}{$chr2}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$strands{$col2}{$chr2}); - @{$ends_order{$col1}{$chr1}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$ends_order{$col1}{$chr1}); - @{$ends_order{$col2}{$chr2}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$ends_order{$col2}{$chr2}); - @{$coords{$col1}{$chr1}->{start}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$coords{$col1}{$chr1}->{start}); - @{$coords{$col2}{$chr2}->{start}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$coords{$col2}{$chr2}->{start}); - - - my @link=($chr->{$chr1}->{name}, $coord_start_chr1 , $coord_end_chr1, #all information output - $chr->{$chr2}->{name}, $coord_start_chr2 , $coord_end_chr2, - scalar @pairs, - join(",",@pairs), - join(",",@{$strands{$col1}{$chr1}}), - join(",",@{$strands{$col2}{$chr2}}), - join(",",@{$ends_order{$col1}{$chr1}}), - join(",",@{$ends_order{$col2}{$chr2}}), - join(",",@{$order{$col1}{$chr1}->{order}}), - join(",",@{$order{$col2}{$chr2}->{order}}), - join(",",@{$coords{$col1}{$chr1}->{start}}), - join(",",@{$coords{$col2}{$chr2}->{start}})); - - print $fh2 join("\t",@link)."\n"; - - if($record>=$warn){ - print LOG "-- $fchr : $warn links processed\n"; - $warn+=100000; - } - } - $fh->close; - $fh2->close; - - print LOG "-- $fchr : Total : $record links processed\n"; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#Sort links according the concerned chromosomes and their coordinates -sub sortLinks{ - - my ($links_file,$sortedlinks_file,$unique)=@_; - - my @sfile=split(/\./,$links_file); - my $fchr=$sfile[$#sfile-2]; - - - print LOG "# $fchr : Sorting links...\n"; - - my $pipe=($unique)? "| sort -u":""; - system "sort -k 1,1 -k 4,4 -k 2,2n -k 5,5n -k 8,8n $links_file $pipe > $sortedlinks_file"; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#removal of fully overlapped links -sub removeFullyOverlappedLinks{ - - my ($links_file,$nrlinks_file,$warn_out)=@_; - - my %pt; - my $n=1; - - my @sfile=split(/\./,$links_file); - my $fchr=$sfile[$#sfile-2]; - - my $fh = new FileHandle; - - $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; - while(<$fh>){ - - my @t=split("\t",$_); - $pt{$n}=[@t]; - $n++; - } - $fh->close; - - my $nb_links=$n-1; - my $nb=$nb_links; - - my %pt2; - my $nb2=1; - my $record=0; - my $warn=10000; - - print LOG "# $fchr : Removing fully overlapped links...\n"; - - LINK: - - for my $i (1..$nb){ - - my @link=(); - my @next_link=(); - my $ind1=$i; - - $record++; - if($record>=$warn){ - print LOG "-- $fchr : $warn unique links analysed - ".($nb2-1)." non-overlapped links done\n"; - $warn+=10000; - } - - if(exists $pt{$ind1}){ - @link=@{$pt{$ind1}}; #link1 - }else{ - next LINK; - } - - my ($chr1,$start1,$end1,$chr2,$start2,$end2)=($link[0],$link[1],$link[2],$link[3],$link[4],$link[5]); #get info of link1 - my @mates=deleteBadOrderSensePairs(split(",",$link[7])); - - my $ind2=$ind1+1; - $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb); #get the next found link - - if($ind2<=$nb){ - - @next_link=@{$pt{$ind2}}; #link2 - my ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]); #get info of link2 - my @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7])); - - while(($chr1 eq $chr3 && $chr2 eq $chr4) && overlap($start1,$end1,$start3,$end3)){ #loop here according to the chr1 coordinates, need an overlap between links to enter - - if(!overlap($start2,$end2,$start4,$end4)){ #if no overlap with chr2 coordinates ->next link2 - - $ind2++; - $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb); - - if($ind2>$nb){ #if no more link in the file -> save link1 - - $pt2{$nb2}=\@link; - $nb2++; - next LINK; - } - - @next_link=@{$pt{$ind2}}; - ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]); - @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7])); - next; - } - - my %mates=map{$_ =>1} @mates; #get the equal number of mates - my @same_mates = grep( $mates{$_}, @next_mates ); - my $nb_mates= scalar @same_mates; - - if($nb_mates == scalar @mates){ - - delete $pt{$ind1}; #if pairs of link 1 are all included in link 2 -> delete link1 - next LINK; #go to link2, link2 becomes link1 - - }else{ - delete $pt{$ind2} if($nb_mates == scalar @next_mates); #if pairs of link2 are all included in link 1 -> delete link2 - $ind2++; #we continue by checking the next link2 - $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb); - - if($ind2>$nb){ #if no more link in the file -> save link1 - - $pt2{$nb2}=\@link; - $nb2++; - next LINK; - } - - @next_link=@{$pt{$ind2}}; #get info of link2 - ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]); - @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7])); - - } - } - } - $pt2{$nb2}=\@link; #if no (more) link with chr1 coordinates overlap -> save link1 - $nb2++; - } - - print LOG "-- $fchr : Total : $nb_links unique links analysed - ".($nb2-1)." non-overlapped links done\n"; - - #OUTPUT - - $fh = new FileHandle; - $fh->open(">$nrlinks_file") or die "$0: can't write in the output: $nrlinks_file :$!\n"; - print LOG "-- $fchr : writing...\n"; - for my $i (1..$nb2-1){ - - print $fh join("\t",@{$pt2{$i}}); #all links output - } - - close $fh; - - print LOG "-- $fchr : output created: $nrlinks_file\n" if($warn_out); - - undef %pt; - undef %pt2; -} -#------------------------------------------------------------------------------# -sub postFiltering { - - my ($links_file,$pflinks_file, $finalScore_thres)=@_; - - my @sfile=split(/\./,$links_file); - my $fchr=$sfile[$#sfile-2]; - - - my ($nb,$nb2)=(0,0); - - print LOG "# $fchr : Post-filtering links...\n"; - print LOG "-- $fchr : final score threshold = $finalScore_thres\n"; - - my $fh = new FileHandle; - my $fh2 = new FileHandle; - - $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; - $fh2->open(">$pflinks_file") or die "$0: can't write in the output: $pflinks_file :$!\n"; - - - while(<$fh>){ - - my @t=split("\t",$_); - my $score=$t[$#t-1]; - - if($score >= $finalScore_thres){ - print $fh2 join("\t", @t); - $nb2++; - } - $nb++; - } - $fh->close; - $fh2->close; - - print LOG "-- $fchr : Total : $nb unique links analysed - $nb2 links kept\n"; - print LOG "-- $fchr : output created: $pflinks_file\n"; -} - - - -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#Filtering of the links -sub strandFiltering{ - - my($chr,$chrID,$pairs_threshold,$strand_filtering,$chromosomes, - $input_format,$cmap_file,$mate_sense, $tag_length,$links_file,$flinks_file)=@_; - - my @sfile=split(/\./,$links_file); - my $fchr=$sfile[$#sfile-1]; - - - my %chrs; - my %chrs1; - my %chrs2; - my $nb_chrs; - my $exclude; - - if($chromosomes){ - my @chrs=split(",",$chromosomes); - $nb_chrs=scalar @chrs; - $exclude=($chrs[0]=~/^\-/)? 1:0; - for my $chrName (@chrs){ - $chrName=~s/^(\-)//; - my $col=($chrName=~s/_(1|2)$//); - - if(!$col){ - $chrs{$chrID->{$chrName}}=undef - }else{ - $chrs1{$chrID->{$chrName}}=undef if($1==1); - $chrs2{$chrID->{$chrName}}=undef if($1==2); - } - } - } - - my $record=0; - my $nb_links=0; - my $warn=10000; - - my $sens_ratio_threshold=0.6; - - print LOG "\# Filtering procedure...\n"; - print LOG "\# Number of pairs and strand filtering...\n"; - print LOG "-- file=$links_file\n"; - print LOG "-- nb_pairs_threshold=$pairs_threshold, strand_filtering=".(($strand_filtering)? "yes":"no"). - ", chromosomes=".(($chromosomes)? "$chromosomes":"all")."\n"; - - - - my $fh = new FileHandle; - my $fh2 = new FileHandle; - - $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; - $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n"; - - while(<$fh>){ - - my @t=split; #for each link - my $is_good=1; - $record++; - - - if($chromosomes){ - - my ($chr1,$chr2)=($chrID->{$t[0]},$chrID->{$t[3]}); - - if(!$exclude){ - $is_good=(exists $chrs{$chr1} && exists $chrs{$chr2})? 1:0; - $is_good=(exists $chrs1{$chr1} && exists $chrs2{$chr2})? 1:0 if(!$is_good); - $is_good=($nb_chrs==1 && (exists $chrs1{$chr1} || exists $chrs2{$chr2}))? 1:0 if(!$is_good); - }else{ - $is_good=(exists $chrs{$chr1} || exists $chrs{$chr2})? 0:1; - $is_good=(exists $chrs1{$chr1} || exists $chrs2{$chr2})? 0:1 if($is_good); - } - } - - $is_good = ($is_good && $t[6] >= $pairs_threshold)? 1 :0; #filtering according the number of pairs - if($is_good && $strand_filtering){ #if filtering according the strand sense - - my @mates=split(/,/,$t[7]); #get the concordant pairs in the strand sense - my @strands1=split(/,/,$t[8]); - my @strands2=split(/,/,$t[9]); - - my %mate_class=( 'FF' => 0, 'RR' => 0, 'FR' => 0, 'RF' => 0); - - my %mate_reverse=( 'FF' => 'RR', 'RR' => 'FF', #group1: FF,RR - 'FR' => 'RF', 'RF' => 'FR'); #group2: FR,RF - - my %mate_class2=( $mate_sense=>"NORMAL_SENSE", inverseSense($mate_sense)=>"NORMAL_SENSE", - substr($mate_sense,0,1).inverseSense(substr($mate_sense,1,1))=>"REVERSE_SENSE", - inverseSense(substr($mate_sense,0,1)).substr($mate_sense,1,1)=>"REVERSE_SENSE"); - - if($t[6] == 1){ - - push(@t,$mate_class2{$strands1[0].$strands2[0]},"1/1",1,1); - - }else{ - - tie (my %class,'Tie::IxHash'); - my $split; - - foreach my $i (0..$#mates){ - $mate_class{$strands1[$i].$strands2[$i]}++; #get the over-represented group - } - - my $nb_same_sens_class=$mate_class{FF}+$mate_class{RR}; - my $nb_diff_sens_class=$mate_class{FR}+$mate_class{RF}; - my $sens_ratio=max($nb_same_sens_class,$nb_diff_sens_class)/($nb_same_sens_class+$nb_diff_sens_class); - - if($sens_ratio < $sens_ratio_threshold){ - %class=(1=>'FF', 2=>'FR'); - $split=1; - }else{ - $class{1}=($nb_same_sens_class > $nb_diff_sens_class)? 'FF':'FR'; #if yes get the concerned class - $split=0; - } - - $is_good=getConsistentSenseLinks(\@t,\@mates,\@strands1,\@strands2,$tag_length,$mate_sense,\%mate_reverse,\%mate_class2,\%class,$split,$pairs_threshold); - } - } - - if($is_good){ #PRINT - - my $nb=scalar @t; - if($nb > 20){ - my @t2=splice(@t,0,20); - print $fh2 join("\t",@t2)."\n"; - $nb_links++; - } - $nb_links++; - print $fh2 join("\t",@t)."\n"; - } - - if($record>=$warn){ - print LOG "-- $fchr : $warn links analysed - $nb_links links kept\n"; - $warn+=10000; - } - } - $fh->close; - $fh2->close; - - print LOG "-- $fchr : No links have been found with the selected filtering parameters\n" if(!$nb_links); - - print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n"; - - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub getConsistentSenseLinks{ - - my ($t,$mates,$strands1,$strands2,$tag_length,$mate_sense, $mate_reverse,$mate_class2, $class, $split,$thres)=@_; - - my $npairs=scalar @$mates; - - my @ends_order1 = split (/,/,$$t[10]); - my @ends_order2 = split (/,/,$$t[11]); - my @order1 = split (/,/,$$t[12]); - my @order2 = split (/,/,$$t[13]); - my @positions1 = split (/,/,$$t[14]); - my @positions2 = split (/,/,$$t[15]); - - my @newlink; - - foreach my $ind (keys %{$class} ){ - - tie (my %flink,'Tie::IxHash'); - my @orders2remove=(); - - foreach my $i (0..$#{$mates}){ #get the pairs belonging the over-represented group - - if((($$strands1[$i].$$strands2[$i]) eq $$class{$ind}) || (($$strands1[$i].$$strands2[$i]) eq $$mate_reverse{$$class{$ind}})){ - push(@{$flink{mates}},$$mates[$i]); - push(@{$flink{strands1}},$$strands1[$i]); - push(@{$flink{strands2}},$$strands2[$i]); - push(@{$flink{ends_order1}},$ends_order1[$i]); - push(@{$flink{ends_order2}},$ends_order2[$i]); - push(@{$flink{positions1}},$positions1[$i]); - push(@{$flink{positions2}},$positions2[$i]); - - }else{ - push(@orders2remove,$order1[$i]); - } - } - - @{$flink{order1}}=(); - @{$flink{order2}}=(); - if(scalar @orders2remove > 0){ - getNewOrders(\@order1,\@order2,\@orders2remove,$flink{order1},$flink{order2}) - }else{ - @{$flink{order1}}=@order1; - @{$flink{order2}}=@order2; - } - - my @ends1; getEnds(\@ends1,$flink{positions1},$flink{strands1},$flink{ends_order1},$tag_length); - my @ends2; getEnds(\@ends2,$flink{positions2},$flink{strands2},$flink{ends_order2},$tag_length); - - my $fnpairs=scalar @{$flink{mates}}; - my $strand_filtering_ratio=$fnpairs."/".$npairs; - my $real_ratio=$fnpairs/$npairs; - - if($fnpairs>=$thres){ #filtering according the number of pairs - - push(@newlink, - $$t[0], - min(min(@{$flink{positions1}}),min(@ends1)), - max(max(@{$flink{positions1}}),max(@ends1)), - $$t[3], - min(min(@{$flink{positions2}}),min(@ends2)), - max(max(@{$flink{positions2}}),max(@ends2)), - $fnpairs, - join(",",@{$flink{mates}}), - join(",",@{$flink{strands1}}), - join(",",@{$flink{strands2}}), - join(",",@{$flink{ends_order1}}), - join(",",@{$flink{ends_order2}}), - join(",",@{$flink{order1}}), - join(",",@{$flink{order2}}), - join(",",@{$flink{positions1}}), - join(",",@{$flink{positions2}}), - $$mate_class2{${$flink{strands1}}[0].${$flink{strands2}}[0]}, - $strand_filtering_ratio, - $real_ratio, - $npairs - ); - } - } - - if (grep {defined($_)} @newlink) { - @$t=@newlink; - return 1 - } - return 0; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub getNewOrders{ - - my($tab1,$tab2,$list,$newtab1,$newtab2)=@_; - my $j=1; - my $k=1; - for my $i (0..$#{$tab2}){ - my $c=0; - for my $j (0..$#{$list}){ - $c++ if(${$list}[$j] < ${$tab2}[$i]); - if(${$list}[$j] == ${$tab2}[$i]){ - $c=-1; last; - } - } - if($c!=-1){ - push(@{$newtab2}, ${$tab2}[$i]-$c); - push(@{$newtab1}, $k); - $k++; - } - } -} - -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#Filtering of the links using their order -sub orderFiltering { - - my ($chr,$chrID,$nb_pairs_threshold,$nb_pairs_order_threshold,$mu,$sigma,$mate_sense,$tag_length,$links_file,$flinks_file)=@_; - - my @sfile=split(/\./,$links_file); - my $fchr=$sfile[$#sfile-2]; - - - my $diff_sense_ends=(($mate_sense eq "FR") || ($mate_sense eq "RF"))? 1:0; - - my $record=0; - my $warn=10000; - my $nb_links=0; - - my $quant05 = 1.644854; - my $quant001 = 3.090232; - my $alphaDist = $quant05 * 2 * $sigma; - my $maxFragmentLength = &floor($quant001 * $sigma + $mu); - - print LOG "\# Filtering by order...\n"; - print LOG "-- mu length=$mu, sigma length=$sigma, nb pairs order threshold=$nb_pairs_order_threshold\n"; - print LOG "-- distance between comparable pairs was set to $alphaDist\n"; - print LOG "-- maximal fragment length was set to $maxFragmentLength\n"; - - - my $fh = new FileHandle; - my $fh2 = new FileHandle; - - $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; - $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n"; - - while(<$fh>){ - - $record++; - my @t = split; - my ($chr1,$chr2,$mates_list)=@t[0,3,7]; - my @pairs=split(",",$mates_list); - ($chr1,$chr2) = ($chrID->{$chr1},$chrID->{$chr2}); - my ($coord_start_chr1,$coord_end_chr1,$coord_start_chr2,$coord_end_chr2) = @t[1,2,4,5]; - my $numberOfPairs = $t[6]; - my @strand1 = split (/,/,$t[8]); - my @strand2 = split (/,/,$t[9]); - my @ends_order1 = split (/,/,$t[10]); - my @ends_order2 = split (/,/,$t[11]); - my @order1 = split (/,/,$t[12]); - my @order2 = split (/,/,$t[13]); - my @positions1 = split (/,/,$t[14]); - my @positions2 = split (/,/,$t[15]); - my @ends1; getEnds(\@ends1,\@positions1,\@strand1,\@ends_order1,$tag_length); - my @ends2; getEnds(\@ends2,\@positions2,\@strand2,\@ends_order2,$tag_length); - my $clusterCoordinates_chr1; - my $clusterCoordinates_chr2; - my $reads_left = 0; - - my $ifRenv = $t[16]; - my $strand_ratio_filtering=$t[17]; - - #kind of strand filtering. For example, will keep only FFF-RRR from a link FFRF-RRRF if orientation is correct - my ($singleBreakpoint, %badInFRSense) = findBadInFRSenseSOLiDSolexa(\@strand1,\@strand2,\@ends_order1,\@ends_order2,\@order1,\@order2,$mate_sense); - #find pairs type F-RRRR or FFFF-R in the case if orientation is correct - #These pairs are annotated as BED pairs forever! They won't be recycled! - my $table; - for my $i (0..$numberOfPairs-1) { #fill the table with non adequate pairs: pairID numberOfNonAdPairs nonAdPairIDs - my $nonAdeq = 0; - for my $j (0..$i-1) { - if (exists($table->{$j}->{$i})) { - $nonAdeq++; - $table->{$i}->{$j} = 1; - } - } - for my $j ($i+1..$numberOfPairs-1) { - if ($positions1[$j]-$positions1[$i]>$alphaDist) { - if (&reversed ($i,$j,$ifRenv,\@positions2)) { - $nonAdeq++; - $table->{$i}->{$j} = 1; - } - } - } - $table->{$i}->{nonAdeq} = $nonAdeq; - } - - for my $bad (keys %badInFRSense) { #remove pairs type F-RRRR or FFFF-R in the case of orientation - &remove($bad,$table); - } - - my @falseReads; - #RRRR-F -> RRRR or R-FFFF -> FFFF - @falseReads = findBadInRFSenseSOLiDSolexa(\@strand1,\@ends_order1,$mate_sense, keys %{$table}); - #these pairs will be recycled later as $secondTable - for my $bad (@falseReads) { - &remove($bad,$table); - } - - my $bad = &check($table); - while ($bad ne "OK") { #clear the table to reject non adequate pairs in the sense of ORDER - # push (@falseReads, $bad); remove completely!!! - &remove($bad,$table); - $bad = &check($table); - } - - $reads_left = scalar keys %{$table}; - my $coord_start_chr1_cluster1 = min(min(@positions1[sort {$a<=>$b} keys %{$table}]),min(@ends1[sort {$a<=>$b} keys %{$table}])); - my $coord_end_chr1_cluster1 = max(max(@positions1[sort {$a<=>$b} keys %{$table}]),max(@ends1[sort {$a<=>$b} keys %{$table}])); - my $coord_start_chr2_cluster1 = min(min(@positions2[sort {$a<=>$b} keys %{$table}]),min(@ends2[sort {$a<=>$b} keys %{$table}])); - my $coord_end_chr2_cluster1 = max(max(@positions2[sort {$a<=>$b} keys %{$table}]),max(@ends2[sort {$a<=>$b} keys %{$table}])); - - $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')'; - $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')'; - - my $ifBalanced = 'UNBAL'; - my $secondTable; - my $clusterCoordinates; - my ($break_pont_chr1,$break_pont_chr2); - - my $signatureType=""; - - my $maxCoord1 =$chr->{$chr1}->{length}; - my $maxCoord2 =$chr->{$chr2}->{length}; - - if (scalar @falseReads) { - @falseReads = sort @falseReads; - #now delete FRFR choosing the majority - my @newfalseReads; #find and remove pairs type RRRR-F or R-FFFF - @newfalseReads = findBadInRFSenseSOLiDSolexa(\@strand1,\@ends_order1,$mate_sense,@falseReads); #these @newfalseReads won't be recycled - my %hashTmp; - for my $count1 (0..scalar(@falseReads)-1) { - my $i = $falseReads[$count1]; - $hashTmp{$i} = 1; - for my $bad (@newfalseReads) { - if ($bad == $i) { - delete $hashTmp{$i}; - next; - } - } - } - @falseReads = sort keys %hashTmp; #what is left - for my $count1 (0..scalar(@falseReads)-1) { #fill the table for reads which were previously rejected - my $nonAdeq = 0; - my $i = $falseReads[$count1]; - - for my $count2 (0..$count1-1) { - my $j = $falseReads[$count2]; - if (exists($secondTable->{$j}->{$i})) { - $nonAdeq++; - $secondTable->{$i}->{$j} = 1; - } - } - for my $count2 ($count1+1..scalar(@falseReads)-1) { - my $j = $falseReads[$count2]; - if ($positions1[$j]-$positions1[$i]>$alphaDist) { - if (&reversed ($i,$j,$ifRenv,\@positions2)) { - $nonAdeq++; - $secondTable->{$i}->{$j} = 1; - } - } - } - $secondTable->{$i}->{nonAdeq} = $nonAdeq; - } - - my @falseReads2; - my $bad = &check($secondTable); - while ($bad ne "OK") { #clear the table to reject non adequate pairs - push (@falseReads2, $bad); - &remove($bad,$secondTable); - $bad = &check($secondTable); - } - if (scalar keys %{$secondTable} >= $nb_pairs_order_threshold) { - my $coord_start_chr1_cluster2 = min(min(@positions1[sort {$a<=>$b} keys %{$secondTable}]),min(@ends1[sort {$a<=>$b} keys %{$secondTable}])); - my $coord_end_chr1_cluster2 = max(max(@positions1[sort {$a<=>$b} keys %{$secondTable}]),max(@ends1[sort {$a<=>$b} keys %{$secondTable}])); - my $coord_start_chr2_cluster2 = min(min(@positions2[sort {$a<=>$b} keys %{$secondTable}]),min(@ends2[sort {$a<=>$b} keys %{$secondTable}])); - my $coord_end_chr2_cluster2 = max(max(@positions2[sort {$a<=>$b} keys %{$secondTable}]),max(@ends2[sort {$a<=>$b} keys %{$secondTable}])); - - $ifBalanced = 'BAL'; - - if ($ifBalanced eq 'BAL') { - - if (scalar keys %{$table} < $nb_pairs_order_threshold) { - $ifBalanced = 'UNBAL'; #kill cluster 1! - ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2 - $reads_left = scalar keys %{$table}; - $coord_start_chr1_cluster1 = $coord_start_chr1_cluster2; - $coord_end_chr1_cluster1 = $coord_end_chr1_cluster2; - $coord_start_chr2_cluster1 = $coord_start_chr2_cluster2; - $coord_end_chr2_cluster1 = $coord_end_chr2_cluster2; - $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')'; - $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')'; - - } else { - - $reads_left += scalar keys %{$secondTable}; - next if ($reads_left < $nb_pairs_threshold); - - if ($coord_end_chr1_cluster2 < $coord_start_chr1_cluster1) { - ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2 - - ($coord_start_chr1_cluster1,$coord_start_chr1_cluster2) = ($coord_start_chr1_cluster2,$coord_start_chr1_cluster1); - ($coord_end_chr1_cluster1,$coord_end_chr1_cluster2)=($coord_end_chr1_cluster2,$coord_end_chr1_cluster1); - ($coord_start_chr2_cluster1,$coord_start_chr2_cluster2)=($coord_start_chr2_cluster2,$coord_start_chr2_cluster1); - ($coord_end_chr2_cluster1 , $coord_end_chr2_cluster2)=($coord_end_chr2_cluster2 , $coord_end_chr2_cluster1); - - $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.'),'.$clusterCoordinates_chr1; - $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.'),'.$clusterCoordinates_chr2; - }else { - $clusterCoordinates_chr1 .= ',('.$coord_start_chr1_cluster2.','.$coord_end_chr1_cluster2.')'; - $clusterCoordinates_chr2 .= ',('.$coord_start_chr2_cluster2.','.$coord_end_chr2_cluster2.')'; - } - $coord_start_chr1 = min($coord_start_chr1_cluster1,$coord_start_chr1_cluster2); - $coord_end_chr1 = max($coord_end_chr1_cluster1,$coord_end_chr1_cluster2); - $coord_start_chr2 = min($coord_start_chr2_cluster1,$coord_start_chr2_cluster2); - $coord_end_chr2 = max($coord_end_chr2_cluster1,$coord_end_chr2_cluster2); - #to calculate breakpoints one need to take into account read orientation in claster.. - my $leftLetterOk = substr($mate_sense, 0, 1); #R - my $rightLetterOk = substr($mate_sense, 1, 1); #F - - - my @index1 = keys %{$table}; - my @index2 = keys %{$secondTable}; - - my (@generalStrand1,@generalStrand2) = 0; - - if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs - $leftLetterOk = 'R'; - $rightLetterOk = 'F'; - @generalStrand1 = translateSolidToRF(\@strand1,\@ends_order1); - @generalStrand2 = translateSolidToRF(\@strand2,\@ends_order2); - } else { - @generalStrand1 = @strand1; - @generalStrand2 = @strand2; # TODO check if it is correct - } - if ($generalStrand1[$index1[0]] eq $leftLetterOk && $generalStrand1[$index2[0]] eq $rightLetterOk) { #(R,F) - $break_pont_chr1 = '('.$coord_end_chr1_cluster1.','.$coord_start_chr1_cluster2.')'; - - if ($generalStrand2[$index1[0]] eq $rightLetterOk && $generalStrand2[$index2[0]] eq $leftLetterOk) { - if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) { - $break_pont_chr2 = '('.$coord_end_chr2_cluster2.','.$coord_start_chr2_cluster1.')'; - $signatureType = "TRANSLOC"; - } else { - $break_pont_chr2 = '('.max(($coord_end_chr2_cluster1-$maxFragmentLength),1).','.$coord_start_chr2_cluster1.')'; - $break_pont_chr2 .= ',('.$coord_end_chr2_cluster2.','.min(($coord_start_chr2_cluster2+$maxFragmentLength),$maxCoord2).')'; - $signatureType = "INS_FRAGMT"; - } - - } elsif ($generalStrand2[$index1[0]] eq $leftLetterOk && $generalStrand2[$index2[0]] eq $rightLetterOk) { - if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) { - $break_pont_chr2 = '('.max(($coord_end_chr2_cluster2-$maxFragmentLength),1).','.$coord_start_chr2_cluster2.')'; - $break_pont_chr2 .= ',('.$coord_end_chr2_cluster1.','.min(($coord_start_chr2_cluster1+$maxFragmentLength),$maxCoord2).')'; - $signatureType = "INV_INS_FRAGMT"; - } else { - $break_pont_chr2 = '('.$coord_end_chr2_cluster1.','.$coord_start_chr2_cluster2.')'; - $signatureType = "INV_TRANSLOC"; - } - } else { - #should not occur - print STDERR "\nError in orderFiltering\n\n"; - } - } - - elsif ($generalStrand1[$index1[0]] eq $rightLetterOk && $generalStrand1[$index2[0]] eq $leftLetterOk) { #(F,R) - $break_pont_chr1 = '('.max(($coord_end_chr1_cluster1-$maxFragmentLength),1).','.$coord_start_chr1_cluster1.')'; - $break_pont_chr1 .= ',('.$coord_end_chr1_cluster2.','.min(($coord_start_chr1_cluster2+$maxFragmentLength),$maxCoord1).')'; - if ($generalStrand2[$index1[0]] eq $rightLetterOk && $generalStrand2[$index2[0]] eq $leftLetterOk) { - if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) { - $break_pont_chr2 = '('.$coord_end_chr2_cluster2.','.$coord_start_chr2_cluster1.')'; - $signatureType = "INV_INS_FRAGMT"; - } else { - $break_pont_chr2 = '('.max(($coord_end_chr2_cluster1-$maxFragmentLength),1).','.$coord_start_chr2_cluster1.')'; - $break_pont_chr2 .= ',('.$coord_end_chr2_cluster2.','.min(($coord_start_chr2_cluster2+$maxFragmentLength),$maxCoord2).')'; - $signatureType = "INV_COAMPLICON"; - } - - } elsif ($generalStrand2[$index1[0]] eq $leftLetterOk && $generalStrand2[$index2[0]] eq $rightLetterOk) { - if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) { - $break_pont_chr2 = '('.max(($coord_end_chr2_cluster2-$maxFragmentLength),1).','.$coord_start_chr2_cluster2.')'; - $break_pont_chr2 .= ',('.$coord_end_chr2_cluster1.','.min(($coord_start_chr2_cluster1+$maxFragmentLength),$maxCoord2).')'; - $signatureType = "COAMPLICON"; - } else { - $break_pont_chr2 = '('.$coord_end_chr2_cluster1.','.$coord_start_chr2_cluster2.')'; - $signatureType = "INS_FRAGMT"; - } - } else { - #should not occur - $signatureType = "UNDEFINED"; - } - } - else { # (F,F) or (R,R) something strange. We will discard the smallest cluster - $ifBalanced = 'UNBAL'; - if (scalar keys %{$secondTable} > scalar keys %{$table}) { - ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2 - - $coord_start_chr1_cluster1 = $coord_start_chr1_cluster2; - $coord_end_chr1_cluster1 = $coord_end_chr1_cluster2; - $coord_start_chr2_cluster1 = $coord_start_chr2_cluster2; - $coord_end_chr2_cluster1 = $coord_end_chr2_cluster2; - $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')'; - $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')'; - } - $reads_left = scalar keys %{$table}; - } - if ($ifBalanced eq 'BAL') { - $ifRenv = $signatureType; - } - } - } - } - } - if ($ifBalanced ne 'BAL') { - #define possible break point - $coord_start_chr1 = $coord_start_chr1_cluster1; - $coord_end_chr1 = $coord_end_chr1_cluster1; - $coord_start_chr2 = $coord_start_chr2_cluster1; - $coord_end_chr2 = $coord_end_chr2_cluster1; - - my $region_length_chr1 = $coord_end_chr1-$coord_start_chr1; - my $region_length_chr2 = $coord_end_chr2-$coord_start_chr2; - - my $leftLetterOk = substr($mate_sense, 0, 1); #R - my $rightLetterOk = substr($mate_sense, 1, 1); #F - - my @index = keys %{$table}; - unless ($diff_sense_ends) { - my $firstEndOrder1 = $ends_order1[$index[0]]; - my $firstEndOrder2 = $ends_order2[$index[0]]; - $break_pont_chr1 = (($strand1[$index[0]] eq 'R' && $firstEndOrder1 == 2) || ($strand1[$index[0]] eq 'F' && $firstEndOrder1 == 1))?'('.$coord_end_chr1.','.min(($coord_start_chr1+$maxFragmentLength),$maxCoord1).')':'('.max(($coord_end_chr1-$maxFragmentLength),1).','.$coord_start_chr1.')'; - $break_pont_chr2 = (($strand2[$index[0]] eq 'R' && $firstEndOrder2 == 2) || ($strand2[$index[0]] eq 'F' && $firstEndOrder2 == 1))?'('.$coord_end_chr2.','.min(($coord_start_chr2+$maxFragmentLength),$maxCoord2).')':'('.max(($coord_end_chr2-$maxFragmentLength),1).','.$coord_start_chr2.')'; - } else { - $break_pont_chr1 = ($strand1[$index[0]] eq $leftLetterOk )?'('.$coord_end_chr1.','.min(($coord_start_chr1+$maxFragmentLength),$maxCoord1).')':'('.max(($coord_end_chr1-$maxFragmentLength),1).','.$coord_start_chr1.')'; - $break_pont_chr2 = ($strand2[$index[0]] eq $leftLetterOk )?'('.$coord_end_chr2.','.min(($coord_start_chr2+$maxFragmentLength),$maxCoord2).')':'('.max(($coord_end_chr2-$maxFragmentLength),1).','.$coord_start_chr2.')'; - } - - if ($chr1 ne $chr2){ - $ifRenv="INV_TRANSLOC" if($ifRenv eq "REVERSE_SENSE"); - $ifRenv="TRANSLOC" if($ifRenv eq "NORMAL_SENSE"); - } - } - - if (($ifBalanced eq 'BAL')&&( (scalar keys %{$table}) + (scalar keys %{$secondTable}) < $nb_pairs_threshold)) { - next; #discard the link - } - if (($ifBalanced eq 'UNBAL')&&(scalar keys %{$table} < $nb_pairs_threshold)) { - next; #discard the link - } - my $ratioTxt = "$reads_left/".(scalar @pairs); - my ($n1,$nTot) = split ("/",$strand_ratio_filtering); - my $ratioReal = $reads_left/$nTot; - - if ($coord_start_chr1<=0) { - $coord_start_chr1=1; - } - if ($coord_start_chr2<=0) { - $coord_start_chr2=1; - } - #create output - my @link=($chr->{$chr1}->{name}, $coord_start_chr1 , $coord_end_chr1, #all information output - $chr->{$chr2}->{name}, $coord_start_chr2 , $coord_end_chr2, - $reads_left, - &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@pairs), - &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@strand1), - &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@strand2), - &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@ends_order1), - &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@ends_order2), - &redraw(2,$table,$secondTable,\%badInFRSense,$ifBalanced,\@order1), - &redraw(2,$table,$secondTable,\%badInFRSense,$ifBalanced,\@order2), - &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@positions1), - &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@positions2), - $ifRenv, - $strand_ratio_filtering, - $ifBalanced, $ratioTxt, $break_pont_chr1, $break_pont_chr2, - $ratioReal, $nTot); - - $nb_links++; - print $fh2 join("\t",@link)."\n"; - - if($record>=$warn){ - print LOG "-- $fchr : $warn links analysed - $nb_links links kept\n"; - $warn+=10000; - } - - } - $fh->close; - $fh2->close; - - print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n"; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#gets information about ends positions given start, direction and order -sub getEnds { - my ($ends,$starts,$strand,$end_order,$tag_length) = @_; - for my $i (0..scalar(@{$starts})-1) { - $ends->[$i] = getEnd($starts->[$i],$strand->[$i],$end_order->[$i],$tag_length); - } -} -sub getEnd { - my ($start,$strand, $end_order,$tag_length) = @_; - return ($strand eq 'F')? $start+$tag_length->{$end_order}-1:$start-$tag_length->{$end_order}+1; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#gets starts and ends Coords when start=leftmost given positions, directions and orders -sub getCoordswithLeftMost { - - my ($starts,$ends,$positions,$strand,$end_order,$tag_length) = @_; - - for my $i (0..scalar(@{$positions})-1) { - - if($strand->[$i] eq 'F'){ - $starts->[$i]=$positions->[$i]; - $ends->[$i]=$positions->[$i]+$tag_length->{$end_order->[$i]}-1; - }else{ - $starts->[$i]=$positions->[$i]-$tag_length->{$end_order->[$i]}+1; - $ends->[$i]=$positions->[$i]; - } - } -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub addInsertionInfo { #add field with INS,DEL,NA and distance between clusters and performs filtering - - my ($chr,$chrID,$nb_pairs_threshold,$order_filtering,$indel_sigma_threshold,$dup_sigma_threshold,$singleton_sigma_threshold,$mu,$sigma,$mate_sense,$tag_length,$links_file,$flinks_file)=@_; - - my @sfile=split(/\./,$links_file); - my $fchr=$sfile[$#sfile-2]; - - - my $diff_sense_ends=(($mate_sense eq "FR") || ($mate_sense eq "RF"))? 1:0; - - my $record=0; - my $nb_links=0; - my $warn=10000; - - print LOG "\# Filtering out normal pairs using insert size...\n"; - print LOG "-- mu length=$mu, sigma length=$sigma, indel sigma threshold=$indel_sigma_threshold, dup sigma threshold=$dup_sigma_threshold\n"; - print LOG "-- using ".($mu-$indel_sigma_threshold*$sigma)."-". - ($mu+$indel_sigma_threshold*$sigma)." as normal range of insert size for indels\n"; - print LOG "-- using ".($mu-$dup_sigma_threshold*$sigma)."-". - ($mu+$dup_sigma_threshold*$sigma)." as normal range of insert size for duplications\n"; - print LOG "-- using ".($mu-$singleton_sigma_threshold*$sigma)." as the upper limit of insert size for singletons\n" if($mate_sense eq "RF"); - - my $fh = new FileHandle; - my $fh2 = new FileHandle; - - $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; - $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n"; - - while(<$fh>){ - - $record++; - my @t = split; - my ($chr1,$chr2,$mates_list)=@t[0,3,7]; - - if($chrID->{$chr1} ne $chrID->{$chr2}) { #if inter-chromosomal link here (because sv_type=all), - $nb_links++; - - $t[16]="INV_TRANSLOC" if($t[16] eq "REVERSE_SENSE"); - $t[16]="TRANSLOC" if($t[16] eq "NORMAL_SENSE"); - - $t[16].= "\t"; - $t[19].= "\t"; - - print $fh2 join("\t",@t)."\n"; - - if($record>=$warn){ - print LOG "-- $fchr : $warn links processed - $nb_links links kept\n"; - $warn+=10000; - } - next; - } - - my $ifRenv = $t[16]; - my $ifBalanced = "UNBAL"; - $ifBalanced = $t[18] if ($order_filtering); - - my $numberOfPairs = $t[6]; - my @positions1 = deleteBadOrderSensePairs(split (/,/,$t[14])); - my @positions2 = deleteBadOrderSensePairs(split (/,/,$t[15])); - - if ($ifBalanced eq "BAL") { - - if ($ifRenv eq "INV_TRANSLOC") { - $ifRenv = "INV_FRAGMT"; #for intrachromosomal inverted translocation is the same as inverted fragment - } - if ($ifRenv eq "NORMAL_SENSE") { - $ifRenv = "TRANSLOC"; - } - if ($ifRenv eq "REVERSE_SENSE") { - $ifRenv = "INV_FRAGMT"; #for intrachromosomal inverted translocation is the same as inverted fragment - } - $t[19].= "\t"; - - my $meanDistance = 0; - - for my $i (0..$numberOfPairs-1) { - $meanDistance += $positions2[$i]-$positions1[$i]; - } - $meanDistance /= $numberOfPairs; - - $t[16] = $ifRenv."\t".$meanDistance; - #dont touch the annotation. It should be already OK. - - } else { - #only for unbalanced - - my $ifoverlap=overlap($t[1],$t[2],$t[4],$t[5]); - - my $ends_sense_class = (deleteBadOrderSensePairs(split (/,/,$t[8])))[0]. - (deleteBadOrderSensePairs(split (/,/,$t[9])))[0]; - my $ends_order_class = (deleteBadOrderSensePairs(split (/,/,$t[10])))[0]. - (deleteBadOrderSensePairs(split (/,/,$t[11])))[0]; - - my $indel_type = $ifRenv; - - my $meanDistance = "N/A"; - - ($meanDistance, $indel_type) = checkIndel ($numberOfPairs, #identify insertion type for rearrangments without inversion, calculates distance between cluster - \@positions1, #assign N/A to $indel_type if unknown - \@positions2, - $ifRenv, - $ifoverlap, - $indel_sigma_threshold, - $dup_sigma_threshold, - $singleton_sigma_threshold, - $mu, - $sigma, - $ifBalanced, - $ends_sense_class, - $ends_order_class, - $mate_sense, - $diff_sense_ends, - ); - - #filtering of pairs with distance inconsistant with the SV - if ($ifRenv ne "REVERSE_SENSE") { - my $maxCoord1 =$chr->{$chrID->{$chr1}}->{length}; - my $maxCoord2 =$chr->{$chrID->{$chr2}}->{length}; - $meanDistance = recalc_t_usingInsertSizeInfo(\@t,$mu,$sigma,$meanDistance,$tag_length,$diff_sense_ends,$mate_sense, - $maxCoord1,$maxCoord2,$ends_sense_class,$ends_order_class,$nb_pairs_threshold,$order_filtering); - next if ($t[6] < $nb_pairs_threshold); - }else{ - $t[19].= "\t"; - } - $t[16] = $indel_type."\t".$meanDistance; - } - - $nb_links++; - - print $fh2 join("\t",@t)."\n"; - if($record>=$warn){ - print LOG "-- $fchr : $warn links processed - $nb_links links kept\n"; - $warn+=10000; - } - } - $fh->close; - $fh2->close; - - print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n"; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub checkIndel { - - my ($numberOfPairs, $positions1, $positions2, $ifRenv, $ifoverlap, $indel_sigma_threshold, $dup_sigma_threshold, $singleton_sigma_threshold, - $mu, $sigma, $ifBalanced,$ends_sense_class,$ends_order_class,$mate_sense,$diff_sense_ends) = @_; - - my $meanDistance = 0; - - for my $i (0..$numberOfPairs-1) { - $meanDistance += $positions2->[$i]-$positions1->[$i]; - } - $meanDistance /= $numberOfPairs; - - return ($meanDistance,"INV_DUPLI") if (($ifRenv eq "REVERSE_SENSE") && ($meanDistance<$mu+$dup_sigma_threshold*$sigma) ); - - return ($meanDistance,"INVERSION") if ($ifRenv eq "REVERSE_SENSE"); - - if($diff_sense_ends){ - return ($meanDistance, "LARGE_DUPLI") if ($ends_sense_class ne $mate_sense) && ($meanDistance>$mu+$dup_sigma_threshold*$sigma) ; - return ($meanDistance, "SINGLETON") if (($meanDistance<$mu-$singleton_sigma_threshold*$sigma) && $mate_sense eq "RF" && ($ends_sense_class eq inverseSense($mate_sense))); - }else{ - return ($meanDistance, "LARGE_DUPLI") if (($ends_sense_class eq $mate_sense) && ($ends_order_class eq "12") || ($ends_sense_class eq inverseSense($mate_sense)) && ($ends_order_class eq "21")) && - ($meanDistance>$mu+$dup_sigma_threshold*$sigma) ; - } - - return ($meanDistance, "SMALL_DUPLI") if (($meanDistance<$mu-$dup_sigma_threshold*$sigma) && $ifoverlap); - - return ($meanDistance, "DUPLICATION") if ($diff_sense_ends && ($ends_sense_class ne $mate_sense) && ($meanDistance<$mu-$dup_sigma_threshold*$sigma) ) ; - - return ($meanDistance, "INSERTION") if ($meanDistance<$mu -$indel_sigma_threshold*$sigma); - return ($meanDistance, "DELETION") if ($meanDistance>$mu+$indel_sigma_threshold*$sigma); - - return ($meanDistance, "UNDEFINED"); -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#sub reacalulate @t so that get rid of unconsistent pairs (unconsistent insert size ) -sub recalc_t_usingInsertSizeInfo { - my($t,$mu,$sigma,$meanDistance,$tag_length,$diff_sense_ends,$mate_sense,$maxCoord1,$maxCoord2,$ends_sense_class,$ends_order_class,$nb_pairs_threshold,$order_filtering) = @_; - - my @badPairs; - - my @positions1 = getAllEntries($t->[14]); - my @positions2 = getAllEntries($t->[15]); - - if ($meanDistance < $mu) { - for my $i (0..scalar(@positions1)-1) { - if (substr($positions2[$i],-1,1) ne '$' && substr($positions2[$i],-1,1) ne '*' && $positions2[$i]-$positions1[$i]>=$mu) { - push(@badPairs,$i); - } - } - } else { - for my $i (0..scalar(@positions1)-1) { - if (substr($positions2[$i],-1,1) ne '$' && substr($positions2[$i],-1,1) ne '*' && $positions2[$i]-$positions1[$i]<=$mu) { - push(@badPairs,$i); - } - } - } - - if (scalar (@badPairs)>0) { - #print join("\t",@badPairs).": ".join("\t",@t)."\n"; - #remove these inconsistant links - $t->[6] -= scalar(@badPairs); #numberOfPairs - return if ($t->[6] < $nb_pairs_threshold); - - $t->[7] = mark_values(\@badPairs, $t->[7]); - $t->[8] = mark_values(\@badPairs, $t->[8]); - $t->[9] = mark_values(\@badPairs, $t->[9]); - $t->[10] = mark_values(\@badPairs, $t->[10]); - $t->[11] = mark_values(\@badPairs, $t->[11]); - - $t->[12] = mark_indexes(\@badPairs, $t->[12]); - $t->[13] = mark_indexes(\@badPairs, $t->[13]); - - $t->[14] = mark_values(\@badPairs, $t->[14]); - $t->[15] = mark_values(\@badPairs, $t->[15]); - $t->[19] = recalculate_ratio($t->[6],$t->[19]) if ($order_filtering); #add the second ratio - $t->[17] = recalculate_ratio($t->[6],$t->[17]) unless ($order_filtering); - ($t->[1],$t->[2]) = recalculate_boundaries($t->[14],$t->[8],$t->[10],$tag_length); - ($t->[4],$t->[5]) = recalculate_boundaries($t->[15],$t->[9],$t->[11],$tag_length); - - #recalc breakpoints: - my $quant001 = 3.090232; - my $maxFragmentLength = &floor($quant001 * $sigma + $mu); - $t->[20] = recalc_breakpoints($mate_sense,$maxCoord1,$t->[14],substr($ends_sense_class,0,1),substr($ends_order_class,0,1),$t->[1],$t->[2],$maxFragmentLength,$diff_sense_ends ) if ($order_filtering); - $t->[21] = recalc_breakpoints($mate_sense,$maxCoord2,$t->[15],substr($ends_sense_class,1,1),substr($ends_order_class,1,1),$t->[4],$t->[5],$maxFragmentLength,$diff_sense_ends ) if ($order_filtering); - #recalc total ratio - $t->[22] = $t->[6] / $t->[23] if ($order_filtering); - $t->[18] = $t->[6] / $t->[19] unless ($order_filtering); - - @positions1 = deleteBadOrderSensePairs(split (/,/,$t->[14])); - @positions2 = deleteBadOrderSensePairs(split (/,/,$t->[15])); - - $meanDistance = 0; - - for my $i (0..scalar(@positions1)-1) { - $meanDistance += $positions2[$i]-$positions1[$i]; - } - $meanDistance /= scalar(@positions1); - - } else { - $t->[17] = recalculate_ratio((split(/\//,$t->[17]))[0],$t->[17]) unless ($order_filtering); - $t->[19] = recalculate_ratio((split(/\//,$t->[19]))[0],$t->[19]) if ($order_filtering); - - } #nothing has been filtered - return $meanDistance; -} - -sub recalculate_ratio { - my ($left, $ratio) = @_; - my @elements = split (/\//,$ratio); - $elements[1]= $elements[0]; - $elements[0]=$left; - return $ratio."\t".join("/",@elements); -} - -sub recalc_breakpoints { - my ($mate_sense,$maxCoord,$startString,$strand,$firstEndOrder,$coord_start_chr,$coord_end_chr,$maxFragmentLength,$diff_sense_ends ) = @_; - my $break_pont_chr; - - my $leftLetterOk = substr($mate_sense, 0, 1); #R - my $rightLetterOk = substr($mate_sense, 1, 1); #F - - - my @positions = deleteBadOrderSensePairs(split (/,/,$startString)); - - unless ($diff_sense_ends) { - $break_pont_chr = (($strand eq 'R' && $firstEndOrder == 2) || ($strand eq 'F' && $firstEndOrder == 1))?'('.$coord_end_chr.','.min(($coord_start_chr+$maxFragmentLength),$maxCoord).')':'('.max(($coord_end_chr-$maxFragmentLength),1).','.$coord_start_chr.')'; - } else { - $break_pont_chr = ($strand eq $leftLetterOk)?'('.$coord_end_chr.','.min(($coord_start_chr+$maxFragmentLength),$maxCoord).')':'('.max(($coord_end_chr-$maxFragmentLength),1).','.$coord_start_chr.')'; - } - return $break_pont_chr; -} -sub recalculate_boundaries { - my ($startString,$senseString,$endsOrderString,$tag_length) = @_; - my @positions = deleteBadOrderSensePairs(split (/,/,$startString)); - my @strands = deleteBadOrderSensePairs(split (/,/,$senseString)); - my @ends_orders = deleteBadOrderSensePairs(split (/,/,$endsOrderString)); - my @ends; getEnds(\@ends,\@positions,\@strands,\@ends_orders,$tag_length); - my $coord_start_cluster = min(min(@positions),min(@ends)); - my $coord_end_cluster = max(max(@positions),max(@ends)); - return ($coord_start_cluster,$coord_end_cluster); -} - -sub remove_indexes { - my ($bads, $string) = @_; - my @elements = deleteBadOrderSensePairs(split (/,/,$string)); - for my $i (reverse sort %{$bads}) { - delete $elements[$i]; - } - return "(".join(",",@elements).")"; -} -##add @ to to elements -sub mark_values { - my ($bads, $string) = @_; - my @elements = getAllEntries($string); - for my $i (@{$bads}) { - $elements[$i] .= "@"; - } - return "(".join(",",@elements).")"; -} -##add @ to to indexes -sub mark_indexes { - my ($bads, $string) = @_; - my @elements = getAllEntries($string); - for my $i ((0..scalar(@elements)-1)) { - for my $j (@{$bads}) { - $elements[$i] .= "@" if ($elements[$i] eq ($j+1)); - } - } - - return "(".join(",",@elements).")"; -} - -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub redraw { - - my ($type,$table,$secondTable,$badInFRSense,$ifBalanced,$arr) = @_; - - my $out; - my @first_arr; - if ($ifBalanced eq 'BAL') { - my @second_arr; - my $lastPushed = 1; - if ($type == 1) { - for my $i (0 .. scalar(@{$arr})-1) { - if (exists ($table->{$i})) { - push(@first_arr,$arr->[$i]); - $lastPushed = 1; - }elsif (exists ($secondTable->{$i})) { - push(@second_arr,$arr->[$i]); - $lastPushed = 2; - } elsif ($lastPushed == 1) { - if (exists ($badInFRSense->{$i})) { - push(@first_arr,$arr->[$i]."\$"); - }else { - push(@first_arr,$arr->[$i]."*"); - } - } elsif ($lastPushed == 2) { - if (exists ($badInFRSense->{$i})) { - push(@second_arr,$arr->[$i]."\$"); - }else { - push(@second_arr,$arr->[$i]."*"); - } - } else {print "Error!";exit;} - } - } else { - for my $i (@{$arr}) { - if (exists ($table->{$i-1})) { - push(@first_arr,$i); - $lastPushed = 1; - }elsif (exists ($secondTable->{$i-1})) { - push(@second_arr,$i); - $lastPushed = 2; - } elsif ($lastPushed == 1) { - if (exists ($badInFRSense->{$i-1})) { - push(@first_arr,$i."\$"); - }else { - push(@first_arr,$i."*"); - } - } elsif ($lastPushed == 2) { - if (exists ($badInFRSense->{$i-1})) { - push(@second_arr,$i."\$"); - }else { - push(@second_arr,$i."*"); - } - } else {print "Error!";exit;} - } - } - $out = '('.join(",",@first_arr).'),('.join(",",@second_arr).')'; - } - else { - if ($type == 1) { - for my $i (0 .. scalar(@{$arr})-1) { - if (exists ($table->{$i})) { - push(@first_arr,$arr->[$i]); - } else { - if (exists ($badInFRSense->{$i})) { - push(@first_arr,$arr->[$i]."\$"); - }else { - push(@first_arr,$arr->[$i]."*"); - } - } - } - } else { - for my $i (@{$arr}) { - if (exists ($table->{$i-1})) { - push(@first_arr,$i); - } else { - if (exists ($badInFRSense->{$i-1})) { - push(@first_arr,$i."\$"); - }else { - push(@first_arr,$i."*"); - } - } - } - } - $out = '('.join(",",@first_arr).')'; - } - return $out; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub check { - - my $table = $_[0]; - my $bad = 'OK'; - my $max = 0; - for my $i (sort {$a<=>$b} keys %{$table}) { - unless ($table->{$i}->{nonAdeq} == 0) { - if ($max<$table->{$i}->{nonAdeq}) { - $max=$table->{$i}->{nonAdeq}; - $bad = $i; - } - } - } - return $bad; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub reversed { - - my ($i,$j,$ifRenv,$positions) = @_; - if (($ifRenv eq 'REVERSE_SENSE' && $positions->[$i]<$positions->[$j]) || ($ifRenv ne 'REVERSE_SENSE' && $positions->[$i]>$positions->[$j])){ - return 1; - } - return 0; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub remove { - - my ($bad,$table) = @_; - for my $i (sort {$a<=>$b} keys %{$table}) { - if ($bad == $i) { - delete($table->{$i});; - } else { - if (exists($table->{$i}->{$bad})) { - delete($table->{$i}->{$bad}); - $table->{$i}->{nonAdeq}--; - } - } - } -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub findBadInRFSenseSOLiDSolexa { #choose maximum: FFFFs or RRRRs - - my ($strand,$ends_order,$mate_sense,@keysLeft) = @_; - - my $leftLetterOk = substr($mate_sense, 0, 1); #R - my $rightLetterOk = substr($mate_sense, 1, 1); #F - - my (@standardArray); - if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs - $leftLetterOk = 'R'; - $rightLetterOk = 'F'; - @standardArray = translateSolidToRF($strand,$ends_order); - } else { - @standardArray = @{$strand}; - } - - my $ifR = 0; - my @Rs; - - for my $i (@keysLeft) { - if ($standardArray[$i] eq $leftLetterOk) { - $ifR++; - push(@Rs,$i); - } - } - - - my $ifF = 0; - my @Fs; - - for my $i (@keysLeft) { - if ($standardArray[$i] eq $rightLetterOk) { - $ifF++; - push(@Fs,$i); - } - } - - if($ifR>=$ifF) { - return @Fs; - } - return @Rs; -} - -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub findBadInFRSenseSOLiDSolexa { #should work both for SOLiD and Solexa - - my ($strand1,$strand2,$ends_order1,$ends_order2,$order1,$order2) = ($_[0],$_[1],$_[2],$_[3],$_[4],$_[5]); - my $mate_sense = $_[6]; - - my $leftLetterOk = substr($mate_sense, 0, 1); #R - my $rightLetterOk = substr($mate_sense, 1, 1); #F - - my (@standardArray1,@standardArray2); - - if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs - $leftLetterOk = 'R'; - $rightLetterOk = 'F'; - @standardArray1 = translateSolidToRF($strand1,$ends_order1); - my @arr = getOrderedStrands($strand2,$order2); - my @ends2 = getOrderedStrands($ends_order2,$order2); - @standardArray2 = translateSolidToRF(\@arr,\@ends2); - - } else { - @standardArray1 = @{$strand1}; - @standardArray2 = getOrderedStrands($strand2,$order2); - } - - #we will try 4 possibilities, 2 for each end of the link: RFRR-FFF->RFFFF , RFRR-FFF->RRRFFF - - #for the first end: - - my @array = @standardArray1; - my %badInFRSense1; - for my $i (1..scalar (@array)-1){ # FRFRFFFF -> FFFFFF and RRFRFRFFFF -> RRFFFFFF - if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) { - $badInFRSense1{$i}=1; - $array[$i] = $rightLetterOk; - } - } - my $numberRRRFFF_or_FFF_1 = scalar(@array)-scalar(keys %badInFRSense1); - @array = @standardArray1; - my %badInFRSense0; - for my $i (reverse(1..scalar (@array)-1)){ # FRFRFFFFRR -> FFFFFFRR - if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) { - $badInFRSense0{$i-1}=1; - $array[$i-1] = $leftLetterOk; - - } - } - my $numberRRF1 = scalar(@array)-scalar(keys %badInFRSense0); - - #for the second end: - @array = @standardArray2; - - my %badInFRSense3; - for my $i (1..scalar(@array)-1){ - if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) { - $badInFRSense3{$order2->[$i]}=1; - $array[$i] = $rightLetterOk; - } - } - my $numberRRRFFF_or_FFF_2 = scalar(@array)-scalar(keys %badInFRSense3); - - @array = @standardArray2; - my %badInFRSense5; - for my $i (reverse(1..scalar (@array)-1)){ # FRFRFFFF -> FFFFFF - if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) { - $badInFRSense5{$i-1}=1; - $array[$i-1] = $leftLetterOk; - } - } - my $numberRRF2 = scalar(@array)-scalar(keys %badInFRSense5); - - if ($numberRRF1>=$numberRRRFFF_or_FFF_1 && $numberRRF1 >= $numberRRRFFF_or_FFF_2 && $numberRRF1 >=$numberRRF2) { - return (1,%badInFRSense0); - } - - if ($numberRRRFFF_or_FFF_1 >=$numberRRF1 && $numberRRRFFF_or_FFF_1 >= $numberRRRFFF_or_FFF_2 && $numberRRRFFF_or_FFF_1 >= $numberRRF2) { - return (1,%badInFRSense1); - } - - if ($numberRRRFFF_or_FFF_2 >= $numberRRF1 && $numberRRRFFF_or_FFF_2 >= $numberRRRFFF_or_FFF_1 && $numberRRRFFF_or_FFF_2 >=$numberRRF2) { - return (2,%badInFRSense3); - } - - if ($numberRRF2 >= $numberRRF1 && $numberRRF2 >= $numberRRRFFF_or_FFF_1 && $numberRRF2 >= $numberRRRFFF_or_FFF_2 ) { - return (2,%badInFRSense5); - } - - #should not get here: - print STDERR "Error in findBadInFRSenseSOLiDSolexa()!\n"; - return (1,%badInFRSense1); -} - -sub getOrderedStrands { - my ($strand,$order) = ($_[0],$_[1]); - my @arr; - for my $i (0..scalar(@{$strand})-1) { - push(@arr,$strand->[$order->[$i]-1]); - } - return @arr; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub checkClusters { - - my ($ifRenv,$coord_start_chr1_cluster1,$coord_start_chr1_cluster2,$coord_start_chr2_cluster1,$coord_start_chr2_cluster2) = @_; - if ($ifRenv eq 'REVERSE_SENSE') { - if ($coord_start_chr1_cluster1 <= $coord_start_chr1_cluster2) { - return ($coord_start_chr2_cluster1 <= $coord_start_chr2_cluster2)?1:0; - } - return ($coord_start_chr2_cluster1 >= $coord_start_chr2_cluster2)?1:0; - } - #if NORM - if ($coord_start_chr1_cluster1 <= $coord_start_chr1_cluster2) { - return ($coord_start_chr2_cluster1 >= $coord_start_chr2_cluster2)?1:0; - } - return ($coord_start_chr2_cluster1 <= $coord_start_chr2_cluster2)?1:0; -} - -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub translateSolidToRF { - my ($strandArr,$ends_orderArr)=@_; - my @array; - for my $i (0..scalar(@{$strandArr})-1) { - if ($ends_orderArr->[$i]==1 && $strandArr->[$i] eq 'F') { - push(@array,'F'); - } - if ($ends_orderArr->[$i]==2 && $strandArr->[$i] eq 'F') { - push(@array,'R'); - } - if ($ends_orderArr->[$i]==1 && $strandArr->[$i] eq 'R') { - push(@array,'R'); - } - if ($ends_orderArr->[$i]==2 && $strandArr->[$i] eq 'R') { - push(@array,'F'); - } - } - return @array; -} - -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#convert the links file to the circos format -sub links2segdup{ - - my($id,$color_code,$links_file,$segdup_file)=@_; - - print LOG "# Converting to the circos format...\n"; - - tie (my %hcolor,'Tie::IxHash'); #color-code hash table - foreach my $col (keys %{$color_code}){ - my ($min_links,$max_links)=split(",",$color_code->{$col}); - $hcolor{$col}=[$min_links,$max_links]; - } - - open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n"; - open SEGDUP, ">$segdup_file" or die "$0: can't write in the output: $segdup_file :$!\n"; - - my $index=1; - while(){ - - my ($chr1,$start1,$end1,$chr2,$start2,$end2,$count)=(split)[0,1,2,3,4,5,6]; - - my $color=getColor($count,\%hcolor,"circos"); #get the color-code according the number of links - - print SEGDUP "$index\t$id$chr1\t$start1\t$end1\tcolor=$color\n". #circos output - "$index\t$id$chr2\t$start2\t$end2\tcolor=$color\n"; - $index++; - } - - close LINKS; - close SEGDUP; - print LOG "-- output created: $segdup_file\n"; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#convert the links file to the bedPE format for BEDTools usage -sub links2bedPElinksfile{ - - my ($sample,$links_file,$bedpe_file)=@_; - - open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n"; - open BEDPE, ">$bedpe_file" or die "$0: can't write in the output: $bedpe_file :$!\n"; - - my $nb_links=1; - - while(){ - - chomp; - my @t=split("\t",$_); - my ($chr1,$start1,$end1,$chr2,$start2,$end2)=splice(@t,0,6); - my $type=($chr1 eq $chr2)? "INTRA":"INTER"; - $type.="_".$t[10]; - - $start1--; $start2--; - - print BEDPE "$chr1\t$start1\t$end1\t$chr2\t$start2\t$end2". - "\t$sample"."_link$nb_links\t$type\t.\t.". - "\t".join("|",@t)."\n"; - - $nb_links++; - } - - close LINKS; - close BEDPE; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub bedPElinks2linksfile{ - - my ($bedpe_file,$links_file)=@_; - - open BEDPE, "<$bedpe_file" or die "$0: can't open: $bedpe_file :$!\n"; - open LINKS, ">$links_file" or die "$0: can't write in the output $links_file :$!\n"; - - while(){ - - chomp; - my $sample=(split("_",(split("\t",$_))[6]))[0]; - my @t1=(split("\t",$_))[0,1,2,3,4,5]; - my @t2=split(/\|/,(split("\t",$_))[10]); - push(@t2,$sample); - - print LINKS join("\t",@t1)."\t".join("\t",@t2)."\n"; - - } - close BEDPE; - close LINKS; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#convert the links file to the bed format -sub links2bedfile{ - - my ($tag_length,$color_code,$links_file,$bed_file)=@_; - - print LOG "# Converting to the bed format...\n"; - - my $compare=1; - if($links_file!~/compared$/){ - $compare=0; - $tag_length->{none}->{1}=$tag_length->{1}; - $tag_length->{none}->{2}=$tag_length->{2}; - } - - #color-code hash table - tie (my %hcolor,'Tie::IxHash'); - my %color_order; - my $n=1; - foreach my $col (keys %{$color_code}){ - my ($min_links,$max_links)=split(",",$color_code->{$col}); - $hcolor{$col}=[$min_links,$max_links]; - $color_order{$col}=$n; - $n++; - } - - my %pair; - my %pt; - $n=1; - open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n"; - - my %str=( "F"=>"+", "R"=>"-" ); - - while(){ - - my @t=split; - my $sample=($compare)? pop(@t):"none"; - - my $chr1=$t[0]; - my $chr2=$t[3]; - $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i); - $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i); - my $same_chr=($chr1 eq $chr2)? 1:0; - - my $count=$t[6]; - my $color=getColor($count,\%hcolor,"bed"); - - my @pairs=deleteBadOrderSensePairs(split(",",$t[7])); - my @strand1=deleteBadOrderSensePairs(split(",",$t[8])); - my @strand2=deleteBadOrderSensePairs(split(",",$t[9])); - my @ends_order1=deleteBadOrderSensePairs(split(",",$t[10])); - my @ends_order2=deleteBadOrderSensePairs(split(",",$t[11])); - my @position1=deleteBadOrderSensePairs(split(",",$t[14])); - my @position2=deleteBadOrderSensePairs(split(",",$t[15])); - my @start1; my @end1; getCoordswithLeftMost(\@start1,\@end1,\@position1,\@strand1,\@ends_order1,$tag_length->{$sample}); - my @start2; my @end2; getCoordswithLeftMost(\@start2,\@end2,\@position2,\@strand2,\@ends_order2,$tag_length->{$sample}); - - - for my $p (0..$#pairs){ - - if (!exists $pair{$pairs[$p]}){ - - if($same_chr){ - - $pair{$pairs[$p]}->{0}=[ $chr1, $start1[$p]-1, $end2[$p], $pairs[$p], 0, $str{$strand1[$p]}, - $start1[$p]-1, $end2[$p], $color, - 2, $tag_length->{$sample}->{$ends_order1[$p]}.",".$tag_length->{$sample}->{$ends_order2[$p]}, "0,".($start2[$p]-$start1[$p]) ]; - $pt{$n}=$pair{$pairs[$p]}->{0}; - $n++; - - }else{ - - $pair{$pairs[$p]}->{1}=[ $chr1, $start1[$p]-1, $end1[$p] , $pairs[$p]."/1", 0, $str{$strand1[$p]}, - $start1[$p]-1, $end1[$p], $color, - 1, $tag_length->{$sample}->{$ends_order1[$p]}, 0]; - $pt{$n}=$pair{$pairs[$p]}->{1}; - $n++; - - - $pair{$pairs[$p]}->{2}=[ $chr2, $start2[$p]-1, $end2[$p], $pairs[$p]."/2", 0, $str{$strand2[$p]}, - $start2[$p]-1, $end2[$p], $color, - 1, $tag_length->{$sample}->{$ends_order2[$p]}, 0]; - $pt{$n}=$pair{$pairs[$p]}->{2}; - $n++; - } - }else{ - - if($same_chr){ - ${$pair{$pairs[$p]}->{0}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{0}}[8]}); - }else{ - ${$pair{$pairs[$p]}->{1}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{1}}[8]}); - ${$pair{$pairs[$p]}->{2}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{2}}[8]}); - } - } - } - } - close LINKS; - - my $nb_pairs=$n-1; - - open BED, ">$bed_file" or die "$0: can't write in the output: $bed_file :$!\n"; - print BED "track name=\"$bed_file\" description=\"mate pairs involved in links\" ". - "visibility=2 itemRgb=\"On\"\n"; - - for my $i (1..$nb_pairs){ - print BED join("\t",@{$pt{$i}})."\n"; - } - - close BED; - - print LOG "-- output created: $bed_file\n"; - - undef %pair; - undef %pt; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub deleteBadOrderSensePairs{ - - my (@tab)=@_; - my @tab2; - - foreach my $v (@tab){ - - $v=~s/[\(\)]//g; - push(@tab2,$v) if($v!~/[\$\*\@]$/); - } - return @tab2; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub getAllEntries{ - my (@tab)=split (/,/,$_[0]); - my @tab2; - - foreach my $v (@tab){ - - $v=~s/[\(\)]//g; - push(@tab2,$v); - } - return @tab2; -}#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub getAllEntriesWOspecialChar{ - my (@tab)=split (/,/,$_[0]); - my @tab2; - - foreach my $v (@tab){ - - $v=~s/[\(\)\$\*\@]//g; - push(@tab2,$v); - } - return @tab2; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub links2SVfile{ - - my($links_file,$sv_file)=@_; - - print LOG "# Converting to the sv output table...\n"; - open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n"; - open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n"; - - my @header=qw(chr_type SV_type BAL_type chromosome1 start1-end1 average_dist - chromosome2 start2-end2 nb_pairs score_strand_filtering score_order_filtering score_insert_size_filtering - final_score breakpoint1_start1-end1 breakpoint2_start2-end2); - - my $nb_links=0; - - while (){ - - my @t=split; - my @sv=(); - my $sv_type="-"; - my $strand_ratio="-"; - my $eq_ratio="-"; - my $eq_type="-"; - my $insert_ratio="-"; - my $link="-"; - my ($bk1, $bk2)=("-","-"); - my $score="-"; - - my ($chr1,$start1,$end1)=($t[0],$t[1],$t[2]); - my ($chr2,$start2,$end2)=($t[3],$t[4],$t[5]); - my $nb_pairs=$t[6]; - $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i); - $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i); - my $chr_type=($chr1 eq $chr2)? "INTRA":"INTER"; - - #if strand filtering - if (defined $t[16]){ - #if inter-chr link - $sv_type=$t[16]; - if(defined $t[17] && $t[17]=~/^(\d+)\/(\d+)$/){ - $strand_ratio=floor($1/$2*100)."%"; - $score=$t[18]; - } - if(defined $t[18] && $t[18]=~/^(\d+)\/(\d+)$/){ - #if intra-chr link with insert size filtering - $strand_ratio=floor($1/$2*100)."%"; - $link=floor($t[17]); - if($sv_type!~/^INV/){ - $insert_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/); - $score=$t[20]; - }else{ - $score=$t[19]; - } - } - } - - if(defined $t[18] && ($t[18] eq "UNBAL" || $t[18] eq "BAL")){ - - #if strand and order filtering only and/or interchr link - $eq_type=$t[18]; - $eq_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/); - ($bk1, $bk2)=($t[20],$t[21]); - foreach my $bk ($bk1, $bk2){$bk=~s/\),\(/ /g; $bk=~s/(\(|\))//g; $bk=~s/,/-/g;} - $score=$t[22]; - - }elsif(defined $t[19] && ($t[19] eq "UNBAL" || $t[19] eq "BAL")){ - - #if all three filtering - $link=floor($t[17]); - $eq_type=$t[19]; - $eq_ratio=floor($1/$2*100)."%" if($t[20]=~/^(\d+)\/(\d+)$/); - - if(defined $t[21] && $t[21]=~/^(\d+)\/(\d+)$/){ - $insert_ratio=floor($1/$2*100)."%"; - ($bk1, $bk2)=($t[22],$t[23]); - $score=$t[24]; - - }else{ - ($bk1, $bk2)=($t[21],$t[22]); - $score=$t[23]; - } - foreach my $bk ($bk1, $bk2){$bk=~s/\),\(/ /g; $bk=~s/(\(|\))//g; $bk=~s/,/-/g;} - - } - - - push(@sv, $chr_type, $sv_type,$eq_type); - push(@sv,"$chr1\t$start1-$end1"); - push(@sv, $link); - push(@sv,"$chr2\t$start2-$end2", - $nb_pairs,$strand_ratio,$eq_ratio,$insert_ratio, decimal($score,4), $bk1, $bk2); - - - print SV join("\t",@sv)."\n"; - } - - close LINKS; - close SV; - - system "sort -k 9,9nr -k 13,13nr $sv_file > $sv_file.sorted"; - - open SV, "<".$sv_file.".sorted" or die "$0: can't open in the output: $sv_file".".sorted :$!\n"; - my @links=; - close SV; - - open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n"; - - print SV join("\t",@header)."\n"; - print SV @links; - close SV; - - unlink($sv_file.".sorted"); - - print LOG "-- output created: $sv_file\n"; - -} -#------------------------------------------------------------------------------# -sub densityCalculation{ - - my ($chr,$chrID,$file,$tag_length,$window_dist,$step,$mates_file,$mates_file_ref,$density_file,$input_format)=@_; - - my @sfile=split(/\./,$$mates_file[$file]); - my $fchr=$sfile[$#sfile]; - - my $fh = new FileHandle; - - my %density; - my %density_ref; - my @ratio; - my ($cov,$cov_ref); - - #FREQUENCY CALCULATION PROCEDURE - print LOG "# $fchr : Frequency calculation procedure...\n"; - &FreqCalculation(\%density,$chr,$chrID,$tag_length,$window_dist,$step,$$mates_file[$file],$input_format); - &FreqCalculation(\%density_ref,$chr,$chrID,$tag_length,$window_dist,$step,$$mates_file_ref[$file],$input_format); - - #MAKING RATIO AND OUTPUT - print LOG "\# Ratio calculation procedure...\n"; - $density_file=~s/\/mates\//\/density\//; - $fh->open(">".$density_file) or die "$0: can't write in the output ".$density_file." :$!\n"; - - foreach my $k (1..$chr->{nb_chrs}){ - foreach my $frag (1..$chr->{$k}->{nb_frag}){ - - @ratio= ($chr->{$k}->{name}, - (${$chr->{$k}->{$frag}}[0]+1), - (${$chr->{$k}->{$frag}}[1]+1)); - - $cov=(exists $density{$k}{$frag}->{count})? $density{$k}{$frag}->{count}:0; - $cov_ref=(exists $density_ref{$k}{$frag}->{count})? $density_ref{$k}{$frag}->{count}:0; - - push(@ratio,$cov,$cov_ref); - push(@ratio,log($cov/$cov_ref)) if($cov && $cov_ref); - push(@ratio,-log($cov_ref+1)) if(!$cov && $cov_ref); - push(@ratio,log($cov+1)) if($cov && !$cov_ref); - next if(!$cov && !$cov_ref); - - print $fh join("\t",@ratio)."\n"; - } - } - - $fh->close; - print LOG "-- output created: $density_file\n"; - - undef %density; - undef %density_ref; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub FreqCalculation{ - - my ($density,$chr,$chrID,$tag_length,$window_dist,$step,$mates_file,$input_format) = @_; - - my @sfile=split(/\./,$mates_file); - my $fchr=$sfile[$#sfile]; - my $fh = new FileHandle; - - my $nb_windows=0; - my $warn=100000; - my $record=0; - my %pair; - - my ($sumX,$sumX2) = (0,0); - - print LOG "\# Frequency calculation for $mates_file...\n"; - - if ($mates_file =~ /.gz$/) { - $fh->open("gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; - }elsif($mates_file =~ /.bam$/){ - o$fh->open("$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY - }else{ - $fh->open("<".$mates_file) or die "$0: can't open ".$mates_file.":$!\n"; - } - - while(<$fh>){ - - my @t=split; - my $mate=$t[0]; - - my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1, $end_order_read2,); - - next if(exists $pair{$mate}); - - next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2,\$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length)); - - next unless (exists $chrID->{$chr_read1} || exists $chrID->{$chr_read2}); - ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2}); - - $pair{$mate}=undef; - $record++; - - my ($coord_start_read1,$coord_end_read1, $coord_start_read2,$coord_end_read2); - - recupCoords($firstbase_read1,\$coord_start_read1,\$coord_end_read1,$tag_length->{$end_order_read1},$input_format); - recupCoords($firstbase_read2,\$coord_start_read2,\$coord_end_read2,$tag_length->{$end_order_read2},$input_format); - - my $length = abs($coord_start_read1-$coord_start_read2); - $sumX += $length; #add to sum and sum^2 for mean and variance calculation - $sumX2 += $length*$length; - - for(my $i=1;$i<=$chr->{$chr_read1}->{'nb_frag'};$i++){ - - if (abs ($coord_start_read1-${$chr->{$chr_read1}->{$i}}[0]) <= $window_dist){ - - &addToDensity($density,$chr_read1,$i,\$nb_windows) - if(overlap($coord_start_read1,$coord_end_read2,${$chr->{$chr_read1}->{$i}}[0],${$chr->{$chr_read1}->{$i}}[1])); - - }else{ - - $i=getNextFrag($coord_start_read1,$i,${$chr->{$chr_read1}->{$i}}[0],$chr->{$chr_read1}->{nb_frag},$window_dist,$step); - } - } - - if($record>=$warn){ - print LOG "-- $warn mate-pairs analysed - $nb_windows points created\n"; - $warn+=100000; - } - } - $fh->close; - - print LOG "-- $fchr : Total : $record mate-pairs analysed - $nb_windows points created\n"; - - if($record>0){ - - my $mu = $sumX/$record; - my $sigma = sqrt($sumX2/$record - $mu*$mu); - print LOG "-- $fchr : mu length = $mu, sigma length = $sigma\n"; - } - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub ratio2segdup{ - - my($id,$density_file,$segdup_file)=@_; - - print LOG "# Converting to circos format...\n"; - - open RATIO, "<$density_file" or die "$0: can't open $density_file :$!\n"; - open SEGDUP, ">$segdup_file" or die "$0: can't write in the output: $segdup_file :$!\n"; - - while(){ - chomp; - my ($chr1,$start1,$end1,$ratio)=(split /\t/)[0,1,2,5]; - print SEGDUP "$id$chr1\t$start1\t$end1\t$ratio\n"; - } - - close RATIO; - close SEGDUP; - print LOG "-- output created: $segdup_file\n"; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub ratio2bedfile{ - - my($density_file,$bed_file)=@_; - - print LOG "# Converting to bedGraph format...\n"; - - open RATIO, "<$density_file" or die "$0: can't open $density_file :$!\n"; - open BED, ">$bed_file" or die "$0: can't write in the output: $bed_file :$!\n"; - print BED "track type=bedGraph name=\"$bed_file\" description=\"log ratios for cnv detection\" ". - "visibility=2 color=255,0,0 alwaysZero=\"On\"\n"; - - while(){ - chomp; - my ($chr1,$start1,$end1,$ratio)=(split /\t/)[0,1,2,5]; - $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/); - print BED "$chr1\t".($start1-1)."\t$end1\t$ratio\n"; - } - - close RATIO; - close BED; - print LOG "-- output created: $bed_file\n"; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub inverseSense{ - - my $mate_sense=$_[0]; - my %reverse=( 'F' => 'R' , 'R' => 'F' , - 'FF' => 'RR', 'RR' => 'FF', - 'FR' => 'RF', 'RF' => 'FR'); - return $reverse{$mate_sense}; -} - -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub getNextFrag{ - - my ($read_start,$frag_num,$frag_start,$frag_last,$window_dist,$step)=@_; - - my $how_far = $read_start-$frag_start; - my $nb_windows_toskip; - - if($how_far>0){ - - $nb_windows_toskip=($how_far/$step)-($window_dist/$step); - $nb_windows_toskip=~ s/\..*//; - $nb_windows_toskip=0 if($nb_windows_toskip<0); - return ($frag_num + $nb_windows_toskip); - } - return $frag_last; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub getColor{ - - my($count,$hcolor,$format)=@_; - for my $col ( keys % { $hcolor} ) { - return $col if($count>=$hcolor->{$col}->[0] && $count<=$hcolor->{$col}->[1]); - } - return "white" if($format eq "circos"); - return "255,255,255" if($format eq "bed"); -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub recupCoords{ - - my($c_hit,$cs_hit,$ce_hit,$tag_length,$input_format)=@_; - my $strand = 'F'; - - if ($c_hit=~s/^\-//) { - $strand='R'; - $$cs_hit=$c_hit; - $$ce_hit=$c_hit-($tag_length-1); - }else{ - $$cs_hit=$c_hit; - $$ce_hit=$c_hit+($tag_length-1); - } - return $strand; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub overlap { - my($cs_hit,$ce_hit,$cs_region,$ce_region)=@_; - if( (($cs_hit < $cs_region) && ($ce_hit < $cs_region )) || (($cs_hit > $ce_region) && ($ce_hit > $ce_region )) ) { - return 0; - } - return 1; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub makeLink { - - my ($link,$chr1,$frag1,$chr2,$frag2,$mt,$nb)=@_; - - if($chr1>$chr2){ - ($chr1,$chr2)= ($chr2,$chr1); - ($frag1,$frag2)= ($frag2,$frag1); - } - - if($chr1 == $chr2){ - if($frag1>$frag2){ - ($frag1,$frag2)= ($frag2,$frag1); - } - } - - if(!exists $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}){ - $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}=$mt; - $$nb++; - }elsif($link->{$chr1}->{$chr2}->{$frag1}->{$frag2}!~/(^|,)$mt(,|$)/){ - $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}.=",$mt"; - } -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#fonction of adding the read to the density profile -sub addToDensity { - - my ($density,$chr1,$frag1,$nb)=@_; - - if(!exists $density->{$chr1}->{$frag1}->{count}){ - $density->{$chr1}->{$frag1}->{count}=1; - $$nb++; - }else{ - $density->{$chr1}->{$frag1}->{count}++; - } -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub floor { - my $nb = $_[0]; - $nb=~ s/\..*//; - return $nb; -} -#------------------------------------------------------------------------------# -sub decimal{ - - my $num=shift; - my $digs_to_cut=shift; - - $num=sprintf("%.".($digs_to_cut-1)."f", $num) if ($num=~/\d+\.(\d){$digs_to_cut,}/); - - return $num; -} - -#------------------------------------------------------------------------------# -sub max { - - my($max) = shift(@_); - foreach my $temp (@_) { - $max = $temp if $temp > $max; - } - return($max); -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub min { - - my($min) = shift(@_); - foreach my $temp (@_) { - $min = $temp if $temp < $min; - } - return($min); -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub sortTablebyIndex{ - my ($tab1,$tab2)=@_; - my @tab3; - - foreach my $i (@$tab1){ - $tab3[$i]=$$tab2[$$tab1[$i]]; - } - return @tab3; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub round { - my $number = shift || 0; - my $dec = 10 ** (shift || 0); - return int( $dec * $number + .5 * ($number <=> 0)) / $dec; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub getUniqueTable{ - - my (@tab)=@_; - my (%saw,@out)=(); - undef %saw; - return sort(grep(!$saw{$_}++, @tab)); -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub catFiles { - - unlink("$_[1]") if(exists $_[1]); - system qq( cat "$_" >> "$_[1]" ) for @{$_[0]}; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#check if the configuration file is correct -sub validateconfiguration{ - - my %conf=%{$_[0]}; - my $list_prgs="@ARGV"; - - my @general_params=qw(input_format mates_orientation read1_length read2_length mates_file cmap_file); - my @detection_params=qw(split_mate_file window_size step_length split_mate_file); - my @filtering_params=qw(split_link_file nb_pairs_threshold strand_filtering split_link_file); - my @circos_params=qw(organism_id colorcode); - my @bed_params=qw(colorcode); - my @compare_params=qw(list_samples file_suffix); - - foreach my $dir ($conf{general}{output_dir},$conf{general}{tmp_dir}){ - - unless (defined($dir)) { - $dir = "."; - } - unless (-d $dir){ - mkdir $dir or die; - } - $dir.="/" if($dir!~/\/$/); - } - - unless (defined($conf{general}{num_threads})) { - $conf{general}{num_threads} = 1; - } - $conf{general}{num_threads}=24 if($conf{general}{num_threads}>24); - - if($list_prgs!~/links2compare/){ - - foreach my $p (@general_params){ - die("Error Config : The parameter \"$p\" is not defined\n") if (!defined $conf{general}{$p}); - } - - $conf{general}{input_format}="sam" if($conf{general}{input_format} eq "bam"); - - unless (defined($conf{general}{sv_type})) { - $conf{general}{sv_type} = "all"; - } - - $conf{general}{read_lengths}={ 1=> $conf{general}{read1_length}, 2=> $conf{general}{read2_length}}; - } - - if($list_prgs=~/(linking|cnv)/){ - - foreach my $p (@detection_params){ - die("Error Config : The parameter \"$p\" is not defined\n") if (!defined $conf{detection}{$p}); - } - - die("Error Config : The parameter \"mates_file_ref\" is not defined\n") if($list_prgs=~/cnv/ && !defined $conf{detection}{mates_file_ref}); - - if($conf{detection}{step_length}>$conf{detection}{window_size}){ - die("Error Config : Parameter \"step_length\" should not exceed \"window size\"\n"); - } - - unless (-d $conf{general}{tmp_dir}."/mates"){ - mkdir $conf{general}{tmp_dir}."/mates" or die; - } - - if($list_prgs=~/linking/){ - unless (-d $conf{general}{tmp_dir}."/links"){ - mkdir $conf{general}{tmp_dir}."/links" or die; - } - } - if($list_prgs=~/cnv/){ - unless (-d $conf{general}{tmp_dir}."/density"){ - mkdir $conf{general}{tmp_dir}."/density" or die; - } - } - - } - - if($list_prgs=~/filtering/){ - - foreach my $p (@filtering_params) { - die("Error Config : The filtering parameter \"$p\" is not defined\n") if (!defined $conf{filtering}{$p}); - - } - - if(defined($conf{filtering}{chromosomes})) { - my @chrs=split(",",$conf{filtering}{chromosomes}); - my $exclude=($chrs[0]=~/^\-/)? 1:0; - for my $chrName (@chrs){ - - die("Error Config : The filtering parameter \"chromosomes\" is not valid\n") - if(($chrName!~/^\-/ && $exclude) || ($chrName=~/^\-/ && !$exclude)); - - } - } - - if (( $conf{filtering}{order_filtering} )&& !$conf{filtering}{strand_filtering}) { - die("Error Config : The parameter strand_filtering is set to \"0\" while order_filtering is selected". - "\nChange strand_filtering to \"1\" if you want to use the order filtering\n"); - } - if (( !defined($conf{filtering}{mu_length}) || !defined($conf{filtering}{sigma_length}) )&& $conf{filtering}{order_filtering}) { - die("Error Config : You should set parameters \"mu_length\" and \"sigma_length\" to use order filtering\n"); - } - if (( $conf{filtering}{insert_size_filtering} )&& !$conf{filtering}{strand_filtering}) { - die("Error Config : The parameter strand_filtering is set to \"0\" while insert_size_filtering is selected". - "\nChange strand_filtering to \"1\" if you want to use the insert size filtering\n"); - } - if (( !defined($conf{filtering}{mu_length}) || !defined($conf{filtering}{sigma_length}) )&& $conf{filtering}{insert_size_filtering}) { - die("Error Config : You should set parameters \"mu_length\" and \"sigma_length\" to use discriminate insertions from deletions\n"); - } - - if (!defined($conf{filtering}{indel_sigma_threshold})) { - $conf{filtering}{indel_sigma_threshold} = 2; - } - if (!defined($conf{filtering}{dup_sigma_threshold})) { - $conf{filtering}{dup_sigma_threshold} = 2; - } - if (!defined($conf{filtering}{singleton_sigma_threshold})) { - $conf{filtering}{singleton_sigma_threshold} = 4; - } - - if (!defined($conf{filtering}{nb_pairs_order_threshold})) { - $conf{filtering}{nb_pairs_order_threshold} = 1; - } - - if (!defined($conf{filtering}{final_score_threshold})) { - $conf{filtering}{final_score_threshold} = 0.8; - } - - if ($conf{filtering}{nb_pairs_order_threshold}>$conf{filtering}{nb_pairs_threshold}) { - die("Error Config : Parameter \"nb_pairs_order_threshold\" should not exceed \"nb_pairs_threshold\"\n"); - } - - } - - if($list_prgs=~/2circos$/){ - foreach my $p (@circos_params) { - next if($list_prgs=~/^ratio/ && $p eq "colorcode"); - die("Error Config : The circos parameter \"$p\" is not defined\n") if (!defined $conf{circos}{$p}); - } - } - - if($list_prgs=~/2bed$/){ - foreach my $p (@bed_params) { - die("Error Config : The bed parameter \"$p\" is not defined\n") if (!defined $conf{bed}{$p}); - } - } - - if($list_prgs=~/links2compare/){ - foreach my $p (@compare_params) { - die("Error Config : The compare parameter \"$p\" is not defined\n") if (!defined $conf{compare}{$p}); - } - - unless (defined($conf{compare}{same_sv_type})) { - $conf{compare}{same_sv_type} = 0; - } - - unless (defined($conf{compare}{min_overlap})) { - $conf{compare}{min_overlap} = 1E-9; - } - - if($conf{compare}{circos_output}){ - foreach my $p (@circos_params) { - next if($list_prgs=~/^ratio/ && $p eq "colorcode"); - die("Error Config : The circos parameter \"$p\" is not defined\n") if (!defined $conf{circos}{$p}); - } - } - if($conf{compare}{bed_output}){ - foreach my $p (@bed_params) { - die("Error Config : The bed parameter \"$p\" is not defined\n") if (!defined $conf{bed}{$p}); - } - die("Error Config : The compare parameter \"list_read_lengths\" is not defined\n") if (!defined $conf{compare}{list_read_lengths}); - - my @samples=split(",",$conf{compare}{list_samples}); - my @read_lengths=split(",",$conf{compare}{list_read_lengths}); - for my $i (0..$#samples){ - my @l=split("-",$read_lengths[$i]); - $conf{compare}{read_lengths}{$samples[$i]}={ 1=> $l[0], 2=> $l[1]}; - } - } - } - - -} -#------------------------------------------------------------------------------# -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# diff -r c284618dd8da -r 091714bd75a0 svdetect/SVDetect_run_parallel.xml --- a/svdetect/SVDetect_run_parallel.xml Tue Nov 06 10:06:26 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,324 +0,0 @@ - - -and identify structural variants - -SVDetect_run_parallel.pl - -#if $getLinks.linking == "linking" -linking - -#end if -#if $getFilteredLinks.filtering == "filtering" -filtering - -#if str($getFilteredLinks.links2SV) == "create" -links2SV --out3 '$sv_file' -#end if -#if $getFilteredLinks.file_conversion.file_conversion_select=="convert" and str($getFilteredLinks.file_conversion.links2circos) == "create" -links2circos --out4 '$circos_file' -#end if -#if $getFilteredLinks.file_conversion.file_conversion_select=="convert" and str($getFilteredLinks.file_conversion.links2bed) == "create" -links2bed --out5 '$bed_file' -#end if -#end if --conf '$config_file' --l '$log_file' --N '$sample_name' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ( - getFilteredLinks['filtering']=="filtering" and - getFilteredLinks['links2SV'] is True - ) - - - - ( - getFilteredLinks['filtering']=="filtering" and - getFilteredLinks['file_conversion']['file_conversion_select']=="convert" and - getFilteredLinks['file_conversion']['links2circos'] is True - ) - - - - ( - getFilteredLinks['filtering']=="filtering" and - getFilteredLinks['file_conversion']['file_conversion_select']=="convert" and - getFilteredLinks['file_conversion']['links2bed'] is True - ) - - - - - - - - - -<general> -input_format = bam -sv_type = ${sv_type} -mates_orientation=${mates_orientation} -read1_length=${read1_length} -read2_length=${read2_length} -mates_file=${mates_file} -cmap_file=${cmap_file} -tmp_dir=$__new_file_path__/svdetect/tmp -output_dir=$__new_file_path__/svdetect -num_threads=8 -</general> - -#if $getLinks.linking == "linking" -<detection> -#if str($getLinks.splitmate) == "split" -split_mate_file=1 -#else -split_mate_file=0 -#end if -window_size=${getLinks.window_size} -step_length=${getLinks.step_length} -</detection> -#end if - -#if $getFilteredLinks.filtering == "filtering" -<filtering> -#if str($getFilteredLinks.splitlink) == "split" -split_link_file=1 -#else -split_link_file=0 -#end if -#if str($getFilteredLinks.chromosomes) != "" -chromosomes=${getFilteredLinks.chromosomes} -#end if -nb_pairs_threshold=${getFilteredLinks.nb_pairs_threshold} -#if $getFilteredLinks.filter1.strand_filtering == "strand" -strand_filtering=1 -final_score_threshold=${getFilteredLinks.filter1.final_score_threshold} -#if $getFilteredLinks.filter1.filter2.order_filtering == "order" -order_filtering=1 -mu_length=${getFilteredLinks.filter1.filter2.mu_length} -sigma_length=${getFilteredLinks.filter1.filter2.sigma_length} -nb_pairs_order_threshold=${getFilteredLinks.filter1.filter2.nb_pairs_order_threshold} -#if $getFilteredLinks.filter1.filter2.filter3.insert_size_filtering == "insert" -insert_size_filtering=1 -indel_sigma_threshold=${getFilteredLinks.filter1.filter2.filter3.indel_sigma_threshold} -dup_sigma_threshold=${getFilteredLinks.filter1.filter2.filter3.dup_sigma_threshold} -singleton_sigma_threshold=${getFilteredLinks.filter1.filter2.filter3.singleton_sigma_threshold} -#else -insert_size_filtering=0 -#end if -#else -order_filtering=0 -#end if -#else -strand_filtering=0 -#end if -</filtering> -#end if - -#if $getFilteredLinks.filtering == "filtering" -#if $getFilteredLinks.file_conversion.file_conversion_select == "convert" -#if str($getFilteredLinks.file_conversion.links2circos) == "create" -<circos> -organism_id=${getFilteredLinks.file_conversion.organism_id} -<colorcode> -#for $color_repeat in $getFilteredLinks.file_conversion.color_code -${color_repeat.color}=${color_repeat.interval} -#end for -</colorcode> -</circos> -#end if -#if str($getFilteredLinks.file_conversion.links2bed) == "create" -<bed> -<colorcode> -#for $color_repeat in $getFilteredLinks.file_conversion.color_code -#if str($color_repeat.color)== "grey" -190,190,190=${color_repeat.interval} -#end if -#if str($color_repeat.color)== "black" -0,0,0=${color_repeat.interval} -#end if -#if str($color_repeat.color)== "blue" -0,0,255=${color_repeat.interval} -#end if -#if str($color_repeat.color)== "green" -0,255,0=${color_repeat.interval} -#end if -#if str($color_repeat.color)== "purple" -153,50,205=${color_repeat.interval} -#end if -#if str($color_repeat.color)== "orange" -255,140,0=${color_repeat.interval} -#end if -#if str($color_repeat.color)== "red" -255,0,0=${color_repeat.interval} -#end if -#end for -</colorcode> -</bed> -#end if -#end if -#end if - - - - -**What it does** - -SVDetect - Version : 0.8 - -Parallel version (nCPU=8) - -SVDetect is a application for the isolation and the type prediction of intra- and inter-chromosomal rearrangements from paired-end/mate-pair sequencing data provided by the high-throughput sequencing technologies - -This tool aims to identifying structural variations (SVs) with both clustering and sliding-window strategies, and helping in their visualization at the genome scale. -SVDetect is compatible with SOLiD and Illumina (>=1.3) reads. - -Manual documentation available at the http://svdetect.sourceforge.net/Site/Manual.html - ------ - -.. class:: infomark - -Contact Bruno Zeitouni (svdetect@curie.fr) for any questions or concerns about the Galaxy implementation of SVDetect. - - - - diff -r c284618dd8da -r 091714bd75a0 svdetect/circos_graph.xml --- a/svdetect/circos_graph.xml Tue Nov 06 10:06:26 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,290 +0,0 @@ - - -plots - -circos/bin/circos - --conf '$circos_config_file' --outputfile '${outputfile}.dat' --png - -> '$log_file' - -; - -rm '$outputfile'; ln -s '${outputfile}.png' '$outputfile' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -<ideogram> - -<spacing> - -default = 5u -break = 1u - -axis_break_at_edge = yes -axis_break = yes -axis_break_style = 2 - -<break_style 1> -stroke_color = black -fill_color = blue -thickness = 0.25r -stroke_thickness = 2 -</break> - -<break_style 2> -stroke_color = black -stroke_thickness = 3p -thickness = 1.5r -</break> - -</spacing> - -## thickness (px) of chromosome ideogram -thickness = 100p -stroke_thickness = 2 -## ideogram border color -stroke_color = black -fill = yes -## the default chromosome color is set here and any value -## defined in the karyotype file overrides it -fill_color = black - -## fractional radius position of chromosome ideogram within image -radius = 0.85r -show_label = yes -label_with_tag = yes -label_font = condensedbold -label_radius = dims(ideogram,radius) + 0.075r -label_size = 60p - -## cytogenetic bands -band_stroke_thickness = 2 - -## show_bands determines whether the outline of cytogenetic bands -## will be seen -show_bands = yes -## in order to fill the bands with the color defined in the karyotype -## file you must set fill_bands -fill_bands = yes - -</ideogram> - - - - - -show_ticks = yes -show_tick_labels = yes - -<ticks> -radius = dims(ideogram,radius_outer) -multiplier = 1e-6 - -<tick> -spacing = 0.5u -size = 2p -thickness = 2p -color = grey -show_label = no -label_size = 12p -label_offset = 0p -format = %.2f -</tick> - -<tick> -spacing = 1u -size = 3p -thickness = 2p -color = dgrey -show_label = no -label_size = 12p -label_offset = 0p -format = %.2f -</tick> - -<tick> -spacing = 5u -size = 5p -thickness = 2p -color = black -show_label = yes -label_size = 16p -label_offset = 0p -format = %d -</tick> - -<tick> -spacing = 10u -size = 8p -thickness = 2p -color = black -show_label = yes -label_size = 20p -label_offset = 5p -format = %d -</tick> -</ticks> - - - - -<colors> -<<include etc/colors.conf>> -</colors> - -<fonts> -<<include etc/fonts.conf>> -</fonts> - -<<include $ideogram_config_file>> -<<include $ticks_config_file>> - -karyotype = $karyotype - -<image> -24bit = yes -##png = yes -##svg = no -## radius of inscribed circle in image -radius = 1500p -background = white -## by default angle=0 is at 3 o'clock position -angle_offset = -90 -#angle_orientation = counterclockwise - -auto_alpha_colors = yes -auto_alpha_steps = 5 -</image> - -chromosomes_units= $chromosomes_units - -#if str($chromosomes)=="" -chromosomes_display_default = yes -#else -chromosomes_display_default = no -chromosomes = $chromosomes -#end if - -<links> - -z = 0 -radius = 0.95r -bezier_radius = 0.2r - -<link segdup> -show = yes -color = dgrey_a5 -thickness = 2 -file = $link_file -record_limit = 1000 -</link> - -</links> - - -anglestep = 0.5 -minslicestep = 10 -beziersamples = 40 -debug = no -warnings = no -imagemap = no - -units_ok = bupr -units_nounit = n - - - - -**What it does** - -Circos - -Manual documentation available at the http://circos.ca/ - - -**Example of link segdup file** - -segdup file:: - - 1 hs1 1077096 1078746 color=red - 1 hs1 1080923 1082805 color=red - 2 hs1 1137684 1137961 color=red - 2 hs3 1138138 1138423 color=red - 3 hs11 1169417 1170000 color=red - 3 hs11 1170025 1170975 color=red - 4 hs11 1222480 1224271 color=green - 4 hs11 1223328 1225675 color=green - 5 hs12 1223336 1225812 color=grey - 5 hs13 1224709 1227633 color=grey - 6 hs11 1223621 1226460 color=red - 6 hs11 1224918 1227633 color=red - 7 hs11 1399510 1401513 color=white - 7 hs11 1401628 1403697 color=white - 8 hs15 1652045 1653746 color=red - 8 hs15 1657167 1658940 color=red - 9 hs11 165333 165887 color=white - 9 hs11 165981 168016 color=white - 10 hs11 1702700 1702841 color=red - 10 hs11 1702903 1703057 color=red - 11 hs11 1912272 1915186 color=white - 11 hs11 1937111 1939824 color=white - 12 hs11 1983211 1983355 color=red - 12 hs11 1983591 1983748 color=red - 13 hs11 2913657 2913898 color=white - 13 hs11 2914048 2914341 color=white - 14 hs11 3090593 3090749 color=purple - 14 hs11 3090709 3090864 color=purple - 15 hs21 3466365 3466434 color=red - 15 hs21 3466554 3466620 color=red - 16 hsX 3603073 3603321 color=white - 16 hsX 3603295 3603520 color=white - - - ------ - -.. class:: infomark - -Contact Bruno Zeitouni (svdetect@curie.fr) for any questions or concerns about the Galaxy implementation of Circos. - - - - -