Mercurial > repos > iarc > mutspec
changeset 0:8c682b3a7c5b draft
Uploaded
author | iarc |
---|---|
date | Tue, 19 Apr 2016 03:07:11 -0400 |
parents | |
children | 748b7a8b634c |
files | Frequency-COSMICv72-Hupki.txt R/compareSignature_Galaxy.r R/estimateSign_Galaxy.r R/mutationSpectra_Galaxy.r R/somaticSignature_Galaxy.r R/transciptionalStrandBias.r README.txt hg19_listAVDB.txt mm9_listAVDB.txt mutspecAnnot.pl mutspecAnnot.xml mutspecAnnot_wrapper.sh mutspecCompare.xml mutspecCompare_wrapper.sh mutspecFilter.pl mutspecFilter.xml mutspecNmf.xml mutspecNmf_wrapper.sh mutspecSplit.pl mutspecSplit.xml mutspecStat.pl mutspecStat.xml mutspecStat_wrapper.sh tool-data/annovar_index.loc.sample tool_data_table_conf.xml.sample tool_dependencies.xml |
diffstat | 26 files changed, 7726 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Frequency-COSMICv72-Hupki.txt Tue Apr 19 03:07:11 2016 -0400 @@ -0,0 +1,97 @@ +Substitution Type Trinucleotide Somatic Mutation Type Signature 1 Signature 2 Signature 3 Signature 4 Signature 5 Signature 6 Signature 7 Signature 8 Signature 9 Signature 10 Signature 11 Signature 12 Signature 13 Signature 14 Signature 15 Signature 16 Signature 17 Signature 18 Signature 19 Signature 20 Signature 21 Signature 22 Signature 23 Signature 24 Signature 25 Signature 26 Signature 27 Signature 28 Signature 29 Signature 30 Signature 1 MEF Signature 2 MEF Signature 3 MEF Signature 5 MEF +C>A ACA A[C>A]A 0.0110983262 0.0006827082 0.0221723068 0.0365 0.0149415477 0.0017 0.0004 0.0367180038 0.012 0.0007 0.0002 0.0077 0.0003347572 0.0001 0.0013 0.0161 0.0018320192 0.0505364186 0.0107 0.0011799616 0.0001 0.0015040704 0.0004533607 0.0286459925 0.009896768 0.0020397729 0.0052056269 0.0013974388 0.0699819873 0 0.000781083 0.0037229109 0.0283533537 0.0003710632 +C>A ACC A[C>A]C 0.0091493407 0.0006191072 0.0178716754 0.0309 0.008960918 0.0028 0.0005 0.0332457222 0.0067 0.001 0.001 0.0047 0.0006487361 0.0042 0.004 0.0097 0.0003422356 0.0109398248 0.0074 0.0022115051 0.0007 0.002451011 0.0003668005 0.0202146384 0.0069989288 0.0014871623 0.0047382274 0.0009171877 0.0551523572 0 0.0022972224 0.0070460466 0.015676074 0.001672691 +C>A ACG A[C>A]G 0.0014900705 0.000099279 0.0021383396 0.0183 0.002207846 0.0005 0 0.0025253113 0.0005 0.0003 0 0.0017 3.8144594E-005 0.0005 0 0.0022 1.576225E-006 0.0022880727 0.0005 1.61691E-007 0 0 0 0.0204789965 0.001448443 0.0002839456 0.0007826979 0 0.017846984 0.0019673 0.0031701397 0.0025537924 0.0272331284 0.0007812591 +C>A ACT A[C>A]T 0.0062338852 0.0003238914 0.0162651456 0.0243 0.0092069053 0.0019 0.0004 0.0335985495 0.0068 0.0092 0.0002 0.0046 0.0008466585 0.0296 0.0057 0.0088 0.0031796648 0.0194240914 0.0074 0.00300801 0.0006 0.0009224525 0 0.0246001454 0.004966565 0.0005978656 0.0027182425 0.00051341 
0.026804716 0 0.0015620621 0.0104484061 0.0079498813 0.0004287024 +C>A CCA C[C>A]A 0.0065958701 0.000677445 0.0187817256 0.0461 0.0096749043 0.0101 0.0012 0.0317237566 0.0098 0.0031 0.0007 0.0135 0.0017100896 0.0056 0.0106 0.0159 0.0010324302 0.0887681088 0.0112 0.0173771106 0.002 0.0045496929 0.0001647394 0.0635592838 0.0148329479 0.0037058501 0.0050650733 0.0011685156 0.0514102117 0 0.0094052338 0.0035222831 0.0414403498 0.0050521059 +C>A CCC C[C>A]C 0.0073423678 0.000213681 0.0157604578 0.0614 0.0049523006 0.0241 0.0006 0.0255054071 0.0057 0.0009 0.0017 0.0112 0.0011592566 0.0102 0.0084 0.01 0.0004218801 0.0206413906 0.0159 0.036502463 0.0014 0.0037644739 0.0007368748 0.0337570047 0.0078221753 0.0039807234 0.0022341533 0.0003342918 0.0258256508 0 0.0031118726 0.0059886212 0.0390237536 0.0007556355 +C>A CCG C[C>A]G 0.0008928404 6.77046E-006 0.0019633898 0.0088 0.0028006273 0.0091 0 0.0011596243 0 0.0007 0.001 0.0028 0.0002441665 0.0009 0.0015 0.0022 0.0002974628 0.0171784025 0.0018 0.0124825875 0.0027 0.0009001633 0.0001639537 0.0224289858 0.0012769767 0.000811742 0.0002663122 0.000053652 0.0144961833 0.0022624 0.0031056722 0.0027351313 0.0444175322 0.000426758 +C>A CCT C[C>A]T 0.0071865816 0.0004163329 0.0147228611 0.0432 0.0110134658 0.0571 0.0013 0.028791173 0.0091 0.016 0.0014 0.0071 0.0012567682 0.1257 0.0228 0.0084 3.1479429E-005 0.0376769589 0.0096 0.1034012262 0.0056 0.0044398462 0.0007227318 0.0200865154 0.0125636547 0.0190384313 0.00310057 0.0001866719 0.0403550741 0 0.0069708534 0.0135089174 0.0262144919 0.0012449334 +C>A GCA G[C>A]A 0.008232604 0.0003520134 0.0096965397 0.0376 0.011892169 0.0024 0.0003 0.0236823289 0.0118 0.0014 0.0004 0.0062 0.0001321096 0.0018 0.0024 0.0096 0.0065354049 0.1287241581 0.0032 0.0011161238 0.0001 0.0012983702 0.0003499075 0.0546764487 0.0134652951 0.0013753118 0.0107558719 0.0021366291 0.0780466101 0.008853 0.0038823793 0.0123398664 0.0263977944 0.0008119624 +C>A GCC G[C>A]C 0.0057580214 0.0001338169 0.0108433411 0.0399 
0.0092478575 0.0058 0.0001 0.0158218964 0.0092 0.0022 0.001 0.0056 0.000754244 0.0114 0.0099 0.0094 0.001293804 0.0160928787 0.0031 0.0032714286 0.0038 0.0018096222 0.0005683378 0.109945989 0.0064682981 0.0019616391 0.0124423351 0.0004292084 0.0734609147 0.0093449 0.0070583387 0.0107953358 0.031326093 0.002913003 +C>A GCG G[C>A]G 0.0006163352 0.0001784417 0.0009291405 0.0227 0.0028091885 0.0021 0 0.0008509138 0 0.0002 0 0.0015 9.6820775E-005 0.0011 0.0013 0.0036 0.0012836634 0.0092714168 0 0.0037516125 0.0003 0.0003338381 0.0001494178 0.046524869 0.0006581031 1.3229664E-005 0.0016296283 0 0.0173832969 0.0008853 0.0015395666 0.0021848297 0.0263967356 2.28346914246428E-020 +C>A GCT G[C>A]T 0.0044590803 0.000122832 0.0122153826 0.0258 0.0103012675 0.0087 0.0001 0.0210612358 0.0085 0.0088 0.0006 0.0027 0.0005511037 0.0736 0.0309 0.0063 0.0031037667 0.0720170701 0.0072 0.0129371945 0.0023 0.0005191482 0 0.0504287095 0.0104730803 0.0019349963 0.0074123552 0.0013185073 0.0596024721 0.0081645 0.0046277367 0.0141566067 0.0151523048 0.0012329031 +C>A TCA T[C>A]A 0.0122500637 0.0151274275 0.0116532249 0.033 0.0147740162 0.0017 0.001 0.027032383 0.0222 0.0374 0.0009 0.0066 0.0481660479 0.0005 0.0057 0.0122 0.0018245583 0.0670806435 0.0051 0.0006882997 0.0001 0.0040516346 0.00207129 0.0314512632 0.0069381466 0.0026800624 0.0070792845 0.0007759028 0.0268944065 0 0.0087885822 0.0073157468 0.0118931991 0.0020899667 +C>A TCC T[C>A]C 0.0111622293 0.0065324925 0.016606775 0.0538 0.012043465 0.0029 0.002 0.0180897733 0.0043 0.0103 0.0025 0.0099 0.0173295571 0.0166 0.0062 0.0145 0.0012865183 0.0418690097 0.0063 0.0040582291 0.0005 0.0027397709 0.0006964102 0.0625382937 0.0102466074 0.0020316439 0.0062688006 0.0008292925 0.031608137 0 0.0045906893 0.0055555952 0.0228092194 0.0029764847 +C>A TCG T[C>A]G 0.0022754957 0.0016564554 0.0013572394 0.0104 0.0039023624 0.0011 0.0002 0.0016948752 0 0.0031 0.0001 0.0019 0.0022931573 0 0.0025 0.004 0.002172942 0.0151623793 0.0018 0.0010011022 0 
0.0005131511 9.9393633E-005 0.0128025907 0 0.0002650717 0.0014464034 0 0.0125004682 0.0016722 0.0007774908 3.70860800590008E-020 0.0151191495 2.28346914246428E-020 +C>A TCT T[C>A]T 0.0152591025 0.0123946107 0.016328076 0.037 0.0182433957 0.0058 0.0013 0.0381413309 0.0322 0.3083 0.0004 0.0049 0.0186345815 0.0516 0.0145 0.0157 0.0050123164 0.1178778467 0.0087 0.0122291045 0.0004 0.0036056509 0.0002074302 0.0302976796 0.0246257872 0.0030171875 0.0098062778 0.0019980737 0.0412071764 0 0.0056468497 0.0041456849 0.016509629 0.0003531941 +C>G ACA A[C>G]A 0.0018010684 0.000263481 0.0240026152 0.0097 0.0116710217 0.0013 0 0.0083568448 0.0048 0.0005 0.0007 0.0031 0.0037751653 0.0001 0.0011 0.0048 0.0016617042 0.0015168876 0.0058 0.0006970411 0.0005 0.000527644 0 0.0119930388 0.0080325676 0.0012728813 0.0013241632 0.000255195 0.0093019711 0 0.0039109532 0.0052086941 0.0112040012 0.0008072133 +C>G ACC A[C>G]C 0.0025809085 0.000269866 0.0121603037 0.0054 0.0072920886 0.0012 0 0.0043063803 0.0023 0.0003 0.0003 0.0015 0.0009208248 0 0.0001 0.0024 0.0016268984 0.0024987845 0.0019 0.00205931 0.0008 0 0 0.0084251198 0.0016359704 0.0015281949 0.0017710841 0.000268899 0.0034790489 0 0.0038915791 0.0021914116 0.004675425 2.28346914246428E-020 +C>G ACG A[C>G]G 0.000592548 0.0002192339 0.0052754195 0.0031 0.0023038392 0 0 0.0005844153 0 0 0 0 1.9890489E-005 0 0.0006 0 2.5801897E-005 0.002614509 0 0.000012735 0 0 0 0 0 0.0003072463 0 0 0.0001544335 0.00482 0.0015371874 3.70860800590008E-020 0.0113226789 2.28346914246428E-020 +C>G ACT A[C>G]T 0.0029639863 0.0006109735 0.0232776563 0.0054 0.0116962457 0.0018 0.0001 0.0086349069 0.0038 0.0002 0.0009 0.0025 0.0038606325 0.0001 0.001 0.0073 0.0013285076 0.0039830112 0.0072 0.0008485871 0.0018 0.0002994547 0 0.0038813832 0.0034284497 0.0024982518 0.0013227198 0.000307698 0.0039767899 0 0.0052747825 0.0105488189 0.0073894579 2.28346914246428E-020 +C>G CCA C[C>G]A 0.0012849834 2.7772077E-005 0.016832572 0.0105 0.0075375232 0 0.0001 0.0066190772 
0.0018 0 0 0.0019 0.0053336514 0 0.0009 0.004 0.0019249788 0.0004584638 0.0002 1.331626E-005 0.0001 0.0005820177 0.0001990492 0.0134768216 0.0072007904 0.0012791061 0.0047306494 0.0001033661 0.0026188632 0 0.0031466824 0.0022969107 0.0094396006 0.001675995 +C>G CCC C[C>G]C 0.0007021348 0.0002796439 0.0135314415 0.0097 0.0076332267 0 0.0004 0.0060777029 0.002 0 0.0002 0.0019 0.0009123035 0 0.0003 0.005 0.0007093678 0.0036542283 0.0028 0.0005481447 0 0.0008252996 0.000283776 0.0161056877 0.0045984731 0.0012149773 0.0012616448 0.0001443108 0.0011115123 0 0.0023630851 0.0028433597 0.0102951282 0.0008310079 +C>G CCG C[C>G]G 0.0005062896 1.9161576E-005 0.0041764575 0.0063 0.0026137604 0 0.0006 0.0006560334 0 0 0.0008 0 0.0002854673 0 0.0004 0.0028 3.517869E-006 0.0054057742 0.0002 0 0 0.0003821146 0 0.000797472 0.0008237093 0.0002078326 0.0003608566 0 0.001520047 0.0032461 0.0015435318 0.0054040236 0.0106255283 0.0008310033 +C>G CCT C[C>G]T 0.0013815427 0.0003127816 0.0240463904 0.0094 0.0094170979 0.0002 0.0003 0.007805027 0.0039 0 0.0002 0.0019 0.0063830976 0 0.0017 0.008 0.0009537763 0.0055196048 0.0008 0.0001207306 0.0003 0.0017606459 0.0001957753 0.0103886659 0.0001963603 0.0022970262 0.0002680262 0.0002909995 0.0001397507 0 0.00319971 3.70860800590008E-020 0.0068522711 0.0004239346 +C>G GCA G[C>G]A 0.0006021227 4.4838514E-005 0.0119168446 0.007 0.0055594231 0 0 0.0037245679 0.0011 0 0 0.0012 0.0013494196 0.0002 0.0022 0.0031 0.0012963242 0.0009238797 0.002 0.0013397693 0.0004 0 0 0.0049450607 0.0041696555 0.0013206075 0.0014181664 8.1591411E-005 0.0002657599 0.0073775 0.00153378 0.0041577528 0.0048126831 2.28346914246428E-020 +C>G GCC G[C>G]C 0.0023933522 1.4520103E-005 0.0098236534 0.0091 0.0053888012 0.003 0.0004 0.003046597 0.0029 0.0002 0.0002 0.0011 0.0007888077 0.0001 0.0005 0.0032 0.0002607609 0.0006486479 0 0.0018111515 0.0042 0 0.0001375011 0.0093698758 0.0023928049 0.0018455011 0.0013467168 0.000105219 0.0009450129 0.0065906 0.0608517207 0.0190915784 
0.0509667244 0.0047167891 +C>G GCG G[C>G]G 2.48534E-007 4.0665889E-005 0.0016710543 0.0062 0.0011005896 0 0 0.0003212128 0 0 0 0 8.1791214E-005 0 0.0006 0 4.377552E-006 0.00054284 0 6.37538E-007 0 0.0002146959 0 0.0056257265 0.0004181939 0.0002050608 0 0 0 0.0015739 3.27595444415171E-020 0.0050892581 0.0129626353 0.0012642272 +C>G GCT G[C>G]T 0.0008900807 0.0002684295 0.0179143343 0.006 0.0060412478 0.0017 0 0.0057756636 0.0044 0.0004 0 0.0009 0.0026801232 0.0003 0.0018 0.0029 0.0018203928 0.0013767249 0 0.0012382642 0.0004 0.0005384388 0 0.0030557701 0.0019940122 0.0012255329 0.004560686 0.0001596794 0.0047967447 0 0.0086309368 0.022922438 0.011972958 0.0024557172 +C>G TCA T[C>G]A 0.0018748532 0.0372418531 0.0160414054 0.0032 0.002681057 0 0.0002 0.0041178788 0.0033 0 0.0001 0.0024 0.2802349465 0 0.0009 0.005 0.0003926847 0.0018029691 0.0005 2.4298469E-005 0.0002 0.0002254906 0.0002306089 0.0102208574 0.0097987246 0.0042018968 0.0028704337 0.0005213704 0.0008204449 0 0.0062124849 0.0015481065 0.0098767034 0.0003657106 +C>G TCC T[C>G]C 0.0020674188 1.9413411E-005 0.0201499204 0.0105 0.0079240483 0.0002 0.001 0.0037996496 0.0025 0 0.0011 0.0027 0.0638852866 0 0.0026 0.0064 0.0006556024 0.0078630068 0.0011 0.0001519986 0.0003 0.000628695 0.000165787 0.0075783653 0.0032665072 0.0028077615 0.0031079676 0.0006673259 0.0004441193 0 0.0046382358 0.0015499829 0.0104195164 2.28346914246428E-020 +C>G TCG T[C>G]G 0.000304897 0.0016254655 0.0025279109 0.0031 0.0013190756 0 0.0003 0.000025653 0 0 0.0003 0 0.009528486 0 0 0 1.3641541E-005 0.0015132233 0 0 0 0.000395808 0 0.0015927142 0.0020574368 6.197E-009 1.2810409E-005 0 0.0001432972 0.0019673 0.0023939233 0.0002293998 0.0040902531 2.28346914246428E-020 +C>G TCT T[C>G]T 0.0031515745 0.0668798997 0.0326736408 0.005 0.0066445957 0.0001 0.0007 0.0062475537 0.0049 0.0001 0.0008 0.0015 0.3302254817 0.0001 0.0015 0.0169 0.0028523804 0.0035671021 0.0105 0.00068306 0.0003 0.0002573751 0.0004389558 0.0061007272 0.0164009169 
0.008019383 0.0021305875 0.0012457795 0.001369869 0 0.0109379855 0.0079398101 0.0198199846 0.0007988588 +C>T ACA A[C>T]A 0.0295145327 0.0074415568 0.0178721707 0.012 0.0218391876 0.0312 0 0.0180666872 0.0093 0 0.0225 0.0121 0.0014801808 0.0293 0.0117 0.0135 0.0093504884 0.0038256239 0.0221 0.0328580473 0.0051 0.0031133897 0.0197671544 0.0063155181 0.0209876181 0.005907232 0.0137232858 0.0054337925 0.0051972213 0.065119 0.0012054247 0.0320885261 0.0098060082 0.0225701805 +C>T ACC A[C>T]C 0.014322747 0.0027263124 0.0088960343 0.0075 0.0127561056 0.0163 0.0197 0.005650015 0.0056 0.0032 0.1099 0.0054 4.0508707E-005 0.0321 0.0169 0.0076 0.004224289 0.0024652143 0.0203 0.0221960469 0.0062 0.0015248603 0.0818805799 0.0038443269 0.0131412837 0.0106264578 0.005424306 0.0018154155 0.00897647 0.054397 0.0020840175 0.0247589098 0.000014969 0.1193404879 +C>T ACG A[C>T]G 0.1716469313 0.0033220833 0.0035727084 0.0028 0.0167601771 0.0908 0.0001 0.0192651055 0.0125 0.0126 0.0072 0.0019 0.0003165947 0.0496 0.0309 0.0035 0.0092292133 0.0084639569 0.0368 0.0345720147 0.0256 0.0036256412 0.0092487151 0.0031106644 0.0196598943 0.0199303232 0.0067064295 0.0054320393 0.0316389964 0.0204604 0.0054138542 0.0094650035 0.0135229545 0.0067748274 +C>T ACT A[C>T]T 0.0126237632 0.0033265284 0.0147976122 0.0059 0.0164779547 0.0149 0.0043 0.0208059675 0.0076 0.0047 0.0639 0.0067 0 0.0277 0.0066 0.0101 0.0057359612 0.0056057399 0.0266 0.0188195596 0.003 0 0.0332096522 0.0034204187 0.0057285193 0.0113345306 0.004475614 0.0016292132 0.0047696666 0.0219359 3.27595444415171E-020 0.039006119 0.0014857753 0.071059016 +C>T CCA C[C>T]A 0.020896447 0.0150195069 0.0143950595 0.021 0.0227692094 0.0085 0.0754 0.00486581 0.0098 0.0012 0.031 0.0187 0.0036472785 0.006 0.0051 0.021 0.0212652059 0.0097675115 0.0438 0.0111451853 0.0034 0.0077313495 0.0605726278 0.0060354474 0.0143339947 0.0065944449 0.0174384848 0.0043915326 0.0091264718 0.0694472 0.0155033821 0.0149734308 0.0134135166 0.0208371809 +C>T CCC C[C>T]C 
0.0185017048 0.0035169181 0.0085447812 0.0144 0.0175094763 0.0099 0.1007 0.0039801504 0.0069 0.0024 0.1518 0.0094 0.0010042293 0.0167 0.0088 0.0098 0.0038472353 0.0056548285 0.0753 0.0181527469 0.0091 0.0071108505 0.1863756341 0 0.0059366437 0.006510632 0.0071448702 0.002647661 0.007067603 0.0638403 0.013021492 0.0277530953 0.0035533335 0.1090353647 +C>T CCG C[C>T]G 0.0955772173 0.0049792757 0.0035184658 0.0076 0.0128622376 0.0901 0.0208 0.0083386392 0.0076 0.0109 0.0135 0.0046 0.0001292021 0.0293 0.0155 0.0016 0.0131761123 0.016183223 0.0238 0.0443849425 0.0318 0.005502031 0.0151490826 0.0029244191 0.0115593916 0.0119049353 0.0075035617 0.0022845636 0.0198751804 0.0173126 0.0093878242 0.0106000807 0.0086266994 0.0033373046 +C>T CCT C[C>T]T 0.0171133076 0.0089565528 0.0160755457 0.0201 0.0204295134 0.0087 0.0788 0.0188436274 0.0097 0.0043 0.0816 0.0205 0.00061177 0.0186 0.0096 0.0211 0.0096480253 0.0051533186 0.1159 0.0138471362 0.004 0.0041697772 0.156015397 0.0039712016 0.0136325969 0.0062385563 0.0071480277 0.0024239843 0.0016340587 0.0342318 0.0037452885 0.0170644264 0.0034364665 0.0750337534 +C>T GCA G[C>T]A 0.0249438142 0.0063908079 0.0161272733 0.0087 0.0200382586 0.0653 0 0.0022612627 0.0062 0 0.0261 0.0124 0.0029341376 0.0557 0.0476 0.0125 0.0020687547 0.0027030687 0.0374 0.056793948 0.0163 0.0019153709 0.053282059 0.0236790513 0.010038112 0.009606515 0.0142153137 0.0033842409 0.011638793 0.0485934 0.0048229646 0.0892436272 0.0043067164 0.0306663268 +C>T GCC G[C>T]C 0.027161494 0.0019958168 0.0082086324 0.008 0.0180223122 0.0773 0.021 0.0016166492 0.0069 0.0134 0.0975 0.0092 0.0025151711 0.0618 0.1345 0.0092 0.0003254834 0.0041993075 0.0614 0.0544298573 0.0392 0.0032291335 0.1629183437 0.0159192018 0.0069538752 0.0195074267 0.0201436462 0.001687414 0.0037817517 0.0494787 0.0037100764 0.0856648665 0.0003026637 0.105842878 +C>T GCG G[C>T]G 0.1035707623 0.000303021 0.0012129229 0.0023 0.0131938176 0.1339 0.0002 0.0116067116 0.0088 0.027 0.009 0.0023 
0.0032052646 0.0978 0.1829 0.0009 0.0075111014 0.0118973608 0.0278 0.0045926886 0.0019 0.0033005989 0.0130464947 0.0126480776 0.010468363 0.0225027174 0.0093684685 0.002189309 0.019684388 0.0157387 0.0091644502 0.0308714868 0.0101967765 0.0097694965 +C>T GCT G[C>T]T 0.0176898544 0.0032658182 0.0106116492 0.0082 0.0195029153 0.0524 0.0161 0.006284929 0.0101 0.0152 0.0522 0.0115 0.0016877305 0.0555 0.08 0.0116 0.0009256972 0.0019133809 0.0917 0.0495808638 0.0166 0.0011140596 0.1075903998 0.0181297784 0.0098238207 0.0173074722 0.007589626 0.0010439936 0.0057613368 0.0188865 0.0016456849 0.1762001422 0.000252515 0.0705253792 +C>T TCA T[C>T]A 0.0144920996 0.4199413996 0.0088802091 0.0035 0.0109975603 0.0074 0.1202 0.0071806685 0.005 0.0037 0.0302 0.0138 0.1138420932 0.0092 0.0112 0.0172 0.0161007846 0.0149233917 0.0218 0.0107103221 0.0064 0.0020167217 0.0145586137 0.0265718943 0.0110775208 0.0113032888 0.0187577765 0.0076222577 0.0120765831 0.0849892 0.00612291 0.016765482 0.0101245636 0.0216295113 +C>T TCC T[C>T]C 0.0176807754 0.0819724961 0.0135295728 0.007 0.0206449648 0.0067 0.2887 0.0040610329 0.0084 0.0211 0.1589 0.0095 0.015024839 0.0283 0.0156 0.0131 0.0167799841 0.0099627443 0.033 0.0081755098 0.0086 0.0019908342 0.0435104469 0.0120556911 0.0064487875 0.010807527 0.0079891844 0.0051166192 0 0.090301 0.0108921843 0.0179803954 0.003814104 0.1327623933 +C>T TCG T[C>T]G 0.0760022217 0.0477201864 0.0017054089 0.0011 0.0075344896 0.0391 0.0992 0.0055351215 0.0047 0.2141 0.008 0.001 0.006102098 0.0094 0.0104 0.0036 0.0162720956 0.0115497151 0.009 0.0231638992 0.0126 0.0014984731 0.0027526405 0.0053633597 0.0140834162 0.0103643293 0.004609131 0.0019180281 0.0158301968 0.0151485 0.0014706199 0.0150859914 0.0140381297 0.0050845027 +C>T TCT T[C>T]T 0.013761704 0.2286749183 0.0103044169 0.0077 0.0117874618 0.0047 0.0844 0.0122090705 0.0096 0.0392 0.0954 0.009 0.0281406612 0.0229 0.0076 0.0193 0.0198274176 0.005398014 0.0267 0.0095400872 0.0064 0.0011822265 0.0243287852 
0.0055324992 0.0124185859 0.0072491454 0.0118809325 0.0061931586 0.0134621373 0.045544 0.0038110945 0.0225109222 0.0040578949 0.0981830992 +T>A ATA A[T>A]A 0.0040215203 1.32377E-007 0.0084285636 0.0048 0.0089029043 0.0006 0.001 0.0133650828 0.0121 0 0.0002 0.0058 0.0012436183 0 0.0021 0.0081 0.0006130025 0.0030372108 0.0011 0.0005744014 0.0083 0.0496192297 0 0.0011667471 0.0204305271 0.0044593399 0.143076389 0.0009640309 0 0.0075743 0.0213988628 0.0016153942 0.0036242193 2.28346914246428E-020 +T>A ATC A[T>A]C 0.0023711442 0.0001130696 0.0073730967 0.0039 0.0073992025 0.0033 0.0008 0.0124312021 0.0042 0.0002 0.0001 0.0065 0.0010600019 0.0008 0.009 0.0073 0.0018785875 0.0008111672 0.0053 0.0019714621 0.0292 0.0116667405 0 1.5173057E-005 0.0088925607 0.0128222429 0.0014640854 0.0033656678 0.0030495957 0.003738 0.0046842304 0.0039533946 0.001510433 0.0012607026 +T>A ATG A[T>A]G 0.00281091 0.0005329294 0.0073573 0.01 0.0115080515 0 0.0009 0.0140366571 0.0068 0 0.0007 0.0046 0.0001025544 0 0.0009 0.008 0.0010873497 0.0045032071 0.0045 0.0001905866 0.0027 0.0679154542 0.0002374185 0.0063324556 0.0062419485 0.0011723443 0.0012262809 0.0017857537 0.003909982 0.0065906 0.0583714179 0.0049727586 0.0085456931 3.92791895501103E-020 +T>A ATT A[T>A]T 0.0083609093 0.000149111 0.008753939 0.003 0.0111936232 0.0053 0.0035 0.0241099166 0.0185 0.0012 0.0003 0.006 0.0009718121 0.0009 0.0079 0.0084 0.0020454937 0.0027365232 0.0033 0.0074211035 0.0043 0.0145999176 0 4.967216E-006 0.0148508776 0.0039931375 0.002535198 0.0056530891 0.0051055274 0.0093447 0.0070547539 0.0025332903 0.0049596807 0.0003670923 +T>A CTA C[T>A]A 0.0011825874 0.000154599 0.0075713203 0.0075 0.0050431635 0.0001 0 0.0120242175 0.0055 0 0.0004 0.0017 0.000285159 0 0.0016 0.0054 3.0079241E-005 0.0004123464 0.0043 4.5042364E-005 0 0.0868985455 0.0005945285 0.0027666348 0.0262963231 0.0035614227 0.3306010175 0.00256658 0.0017654475 0.0053118 0.0483959361 3.70860800590008E-020 2.80409853263637E-020 0.0004214101 +T>A CTC 
C[T>A]C 0.0019031669 0.000464019 0.0127254635 0.0111 0.0062088437 0.0026 0.0014 0.0178810924 0.0049 0.0001 0.0011 0.0086 0.0002222873 0.0034 0.0043 0.0128 0.0102237477 0.0024914113 0.0053 0.0043890556 0.0037 0.0537068468 0 0.0026944625 0.0157998831 0.003901611 0.0028135988 0.0196108974 0.0003833591 0.0067873 0.0529238918 0.0055241267 0.0121689451 0.0004418274 +T>A CTG C[T>A]G 0.0014879606 0.0002304098 0.0115085619 0.0342 0.010507569 0.0008 0.0007 0.0163567745 0.0051 0.0003 0.0006 0.0031 0.0002869518 0.0002 0.0027 0.013 0.0044670362 0.0055230457 0.0021 0.0011278358 0.0025 0.2132208666 0.0001731204 0.0069111647 0.0628606064 0.0023903042 0.0043968571 0.0088336 0.0035568452 0.0082628 0.1374715155 0.0042244367 0.0171153481 0.0023322216 +T>A CTT C[T>A]T 0.0021793444 0.0005748856 0.0164561777 0.0115 0.0097846407 0.0011 0.0018 0.0262478019 0.0069 0.0009 0.0004 0.0054 8.00446E-007 0.002 0.0007 0.0096 0.0312448834 0.0033137133 0.0022 0.0023855413 0.001 0.0618275075 0.0004578131 0.0029152002 0.0366223511 0.0016359227 0.0054308916 0.0109426098 0.0022327555 0 0.0421848137 0.0095318815 0.0120375101 0.0032148689 +T>A GTA G[T>A]A 0.0006892894 0.0001149942 0.0044347826 0.0069 0.0067403877 0 0 0.0081391786 0.003 0 0.0003 0.0015 0.0001379604 0 0 0.003 8.545689E-006 0.0018754341 0.0026 0.0001631298 0 0.0537091457 0 0.0022251273 0.0362178008 0.0022428865 0.109851872 0.0005514608 0.0029300981 0.003738 0.0446039243 3.70860800590008E-020 0.0011458553 2.28346914246428E-020 +T>A GTC G[T>A]C 0.0005524095 0.0002938621 0.0056154324 0.0052 0.0040215965 0.0028 0 0.0079100803 0.0029 0.0001 0.0003 0.0039 0.0002528674 0.0025 0.0038 0.004 0.002573467 0.0015103098 0 0.0037991795 0.0142 0.0094893967 0.0001832038 0.0048531492 0.0058471096 0.0052072871 2.3455678E-005 0.0031639876 0 0.0033445 0.0125235888 0.0030277907 0.0012285467 0.0008083668 +T>A GTG G[T>A]G 0.0012002288 8.8721344E-005 0.0080701365 0.0133 0.0068143523 0.0006 0.0006 0.0090572631 0.0028 0 0.0004 0.0019 0.0001535175 0 0.0026 0.0051 
0.0035388226 0.0015053688 0.0016 0.0008283337 0.0031 0.0502939024 0 0.0001957586 0.0382411793 0.0013581218 0.0015659372 0.002152847 0.0005336324 0.0050167 0.0460502984 0.0010906538 0.0077826056 0.0019375384 +T>A GTT G[T>A]T 0.0021071368 0.0002157555 0.0086791217 0.0045 0.0051010329 0.0021 0.001 0.0178122386 0.0036 0.0004 0.0001 0.0031 0.0001227456 0.0009 0.0043 0.0033 0.00387901 0.0036110252 0.0052 0.0019007466 0.0041 0.0106014572 0 0 0.0201679544 0.0025131129 0 0.0020077332 0 0.0063939 0.007042014 0.0029648102 0.0046125727 0.0021050728 +T>A TTA T[T>A]A 0.0056001554 8.0752486E-005 0.0071326814 0.0045 0.0092056373 0.0002 0.001 0.0151399962 0.0215 0 0.0003 0.0057 0.0004032613 0 0.0008 0.0085 0.0006988123 0.004663278 0.0039 0.000924548 0.0027 0.0408105028 0 0 0.021830329 0.0006281453 0.0528138875 0.0002032376 0.0011912074 0.0076726 0.0184166439 3.70860800590008E-020 0.002312234 2.28346914246428E-020 +T>A TTC T[T>A]C 0.0019990793 4.802898E-006 0.0091027978 0.0046 0.0068353442 0.0008 0.0015 0.0120935546 0.0027 0.0008 0 0.0132 0.0007718133 0.0016 0.0019 0.0087 0.0025793586 0.0025781555 0.0048 0.0007082068 0.0045 0.0255703958 0 0.0079941292 0.0034056867 0.0050737539 4.7091785E-005 0.0101809649 0.001682664 0.0064922 0.0160946246 0.0018285335 0.0003268915 0.0012342766 +T>A TTG T[T>A]G 0.0010900657 0.000066605 0.006565863 0.0082 0.0071442088 0 0.001 0.0083625466 0.014 0 0 0.0037 0.0002451942 0 0.0006 0.0063 2.1133639E-005 0.0020167384 0.0052 9.7003127E-005 0 0.0698452181 0.0002315256 0.0112021436 0.0101915204 2.0138391E-005 0 0.0032504272 0.002438756 0.0036396 0.040365198 0.0032796029 0.0053251107 0.0007275037 +T>A TTT T[T>A]T 0.0039810228 0.0002763159 0.0147121354 0.0045 0.0102408504 0.0007 0.005 0.0276532963 0.0139 0.0028 0.0008 0.0094 0.000160346 0.0012 0.0003 0.008 0.0038831306 0.0028186794 0.0085 0.0023134891 0.0008 0.0235790618 0 0.0002705788 0.0145275393 0.0012355385 0.0019297708 0.0164928123 0 0 0.0069546395 0.0010941496 0.0014059298 2.28346914246428E-020 +T>C ATA 
A[T>C]A 0.013915773 0.0013035096 0.0130357094 0.0084 0.0353667351 0.0075 0.0005 0.0162065101 0.0252 0.0031 0.0006 0.0466 0.0005494542 0.0026 0.0115 0.0717 0.002697314 0.0029330536 0.0145 0.016395063 0.0265 0.0168399309 0.0010684488 0.0047260091 0.0206632828 0.0550286521 0.0050473011 0.0040927469 0.0101191812 0 0.0007369268 0.0018982472 0.0032086778 0.0021449978 +T>C ATC A[T>C]C 0.0062749606 0.0004255767 0.0091862897 0.002 0.0137711021 0.0056 0.0001 0.0077878028 0.0105 0.0052 0.0015 0.0317 0.0001455067 0.0046 0.0073 0.0171 0.0080781501 0.0015132848 0.0048 0.0192517936 0.0165 0.0011082624 0.0002215732 0.0006279493 0.0095024784 0.0275946042 0.0018456912 0.0078807292 0.0063496827 0.0087547 0.0023157171 0.0082384411 0.0047295938 0.0016959289 +T>C ATG A[T>C]G 0.0101376362 0.000575473 0.0117169201 0.0081 0.0284490517 0.0217 0.0007 0.0098525836 0.0176 0.0026 0.0008 0.0487 0.000279178 0.0035 0.0073 0.0348 0.0044328707 0.003329669 0.0135 0.054784918 0.0224 0.0049136164 0.0004988015 0.0027237586 0.0199114925 0.0517910441 0.0056888139 0.0055208336 0.0146596103 0.0090498 0.0117071286 0.0155936441 0.0199843865 0.0033499277 +T>C ATT A[T>C]T 0.0092563164 0.0014881641 0.0169788708 0.0036 0.0273029742 0.0023 0.0011 0.0218294638 0.0247 0.004 0.0003 0.025 0.0002674614 0.0025 0.0063 0.0497 0.0031403702 0.0054913178 0.016 0.0073412699 0.0189 0.0026305239 0.0003132405 0.0071223685 0.0144883903 0.0390722444 0.0025927546 0.003232484 0.0078924134 0 0.0032016902 0.0009747108 0.0057597756 0.0016824476 +T>C CTA C[T>C]A 0.0041766749 0.0005466192 0.0078950126 0.0052 0.0142386298 0.0062 0.001 0.0051738424 0.0154 0.0005 0.0001 0.0312 4.2751466E-005 0.0006 0.0032 0.0146 0.0074269996 0.0001921693 0.0066 0.0148676965 0.0219 0.0040273463 0 0.0018488765 0.0090280149 0.0378893777 0.0048990792 0.0207380034 0 0 3.27595444415171E-020 3.70860800590008E-020 0.0037334463 0.0012694857 +T>C CTC C[T>C]C 0.0052525933 0.0003923326 0.0144311704 0.0026 0.0124895004 0.004 0.0008 0.0109368938 0.0118 0.0019 0.0004 
0.0385 4.3307542E-005 0.0053 0.005 0.0194 0.0396570012 0.0030501926 0.007 0.018356161 0.01 0.0013524439 0 0.0044536899 0.0107716324 0.021740735 0.0057244485 0.0432837972 0.0027749537 0 0.0079452901 3.70860800590008E-020 0.0108649219 0.0037894225 +T>C CTG C[T>C]G 0.0070132253 0.0003620245 0.0084229721 0.01 0.0187416312 0.027 0.001 0.0056577736 0.0154 0.0015 0.0001 0.0415 5.4272534E-005 0.0016 0.0128 0.019 0.0286096963 0.0025964642 0.004 0.0741505626 0.0211 0.0027496661 0.0003207049 0 0.0133506613 0.0472401033 0.005661479 0.0370882617 0.0069024147 0 0.0053848162 0.0065013981 0.0174152084 0.0029716741 +T>C CTT C[T>C]T 0.0067138131 0.0005609001 0.0119324305 0.0054 0.0190540829 0.0033 0.0045 0.0111203893 0.0275 0.002 0.0009 0.0324 7.63944E-007 0.0027 0.0035 0.0263 0.1018696577 0.0013100287 0.0053 0.0110011183 0.0204 0.0023633535 0 0.001597389 0.00896103 0.0207406214 0.0039005891 0.005052716 0.0030414249 0 0.0126209242 0.0081108597 0.017359194 0.0016698115 +T>C GTA G[T>C]A 0.0112478351 4.673852E-006 0.0068500556 0.0061 0.0162949828 0.0122 0.0002 0.0057207785 0.0091 0.0064 0.001 0.0573 0.0010540812 0.0078 0.0195 0.0161 1.1401335E-005 0.0005291265 0.0039 0.0024693171 0.1426 0.0042174541 0.0002511686 0.0012697498 0.0174898239 0.0980528247 0.0047773804 0.0036550481 0.0070908038 0 0.0047403321 0.0019137689 0.0019750503 0.0016904982 +T>C GTC G[T>C]C 0.0069997243 0.0001860659 0.006260749 0.0016 0.0095745606 0.0059 0.0001 0.0051810515 0.0097 0.0101 0.0016 0.0274 0.0005298997 0.0127 0.0499 0.0099 0.0096577133 0.0022791308 0.003 0.0015656329 0.0665 0 0.0005382185 0.00671034 0.0040456124 0.040226364 0.0019649543 0.0070711753 0.0071113275 0.0065906 0.0095055012 0.0067394905 0.007108284 0.007601967 +T>C GTG G[T>C]G 0.0049775926 0.000000495 0.006098763 0.0042 0.0141039293 0.0115 0.0003 0.0050407075 0.0094 0.0054 0.0019 0.041 0.0006300957 0.0082 0.0154 0.0148 0.0046541291 0.0058710566 0.006 0.027341755 0.0884 0.0014748845 0 0.000277685 0.0134022463 0.044620701 0.0012291677 0.0082322384 
0.0056430636 0.0078694 0.0062721008 0.0110147688 0.0095895668 0.001697214 +T>C GTT G[T>C]T 0.0106674061 0.0005785981 0.0075093439 0.0024 0.0156668747 0.0042 0.0081 0.0084882203 0.0126 0.0124 0.0023 0.0405 0.0006761879 0.0086 0.0094 0.0142 0.0081492346 0.0047715657 0.0099 0.0014982648 0.1003 0.0015610427 0.0009957697 0.0042384777 0.0141596937 0.0555280683 0.003747225 0.0004078845 0.0112376743 0.0084596 0.0022664575 0.0082567832 0.0036822191 0.0016897626 +T>C TTA T[T>C]A 0.0080736164 0.0001019324 0.0091154951 0.0028 0.017059641 0.0059 0.0023 0.0049010724 0.0156 0.003 0.0002 0.05 0.0007840561 0.0018 0.0068 0.0246 0.0017085867 0.0003023036 0.0052 0.0057342296 0.0389 0.0048036697 0 0.001682291 0.0145150501 0.0287896805 0.0017210153 0.0099295521 0.011498666 0 0.0014804542 3.70860800590008E-020 0.0005155448 0.0004342265 +T>C TTC T[T>C]C 0.0048573812 0.0004703437 0.0109537909 0.0016 0.0141957091 0.0035 0.0018 0.0060810113 0.0137 0.0097 0.0009 0.0342 0.0001329749 0.0056 0.0099 0.0183 0.0110741521 0.0062607911 0.002 0.0112342547 0.0313 0.0009121575 0.0002306089 0.0023445277 0.0074060835 0.0366394924 0.0034892126 0.0290105632 0.003584215 0.0091481 0.0007815096 0.001861857 0.0043626179 0.0012602376 +T>C TTG T[T>C]G 0.0083254542 0.0001923542 0.0061130332 0.0036 0.0125965017 0.0106 0.0019 0.0017124847 0.0098 0.0065 0.0007 0.032 3.7402404E-005 0.0034 0.0039 0.0165 0.0059320028 0.0015412653 0.0057 0.0237676113 0.0249 0.0017817357 0.0002395138 0.0004812742 0.0048925312 0.0191436943 0.0011528466 0.0156832186 0.0016942967 0.0060004 0.0007662323 3.70860800590008E-020 0.0026293456 0.0012747632 +T>C TTT T[T>C]T 0.0062571056 0.0005853303 0.0107743363 0.0022 0.0173751391 0.0029 0.0024 0.0100034418 0.0309 0.0099 0.0005 0.0371 0.0001797028 0.0026 0.0047 0.0174 0.0072082184 0.0013577971 0.0038 0.0053951863 0.0274 0 0 0.000358148 0.0170147808 0.0230716888 0.001527957 0.0172915636 0.0037997058 0 0.0032239339 0.0005133397 0.0022567107 2.28346914246428E-020 +T>G ATA A[T>G]A 0.0015876364 
3.3718411E-005 0.0023513681 0 0.003462152 0 0 0.0042197193 0.0335 0 0 0.0012 0.0003861998 0 0 0.0052 5.3139249E-005 0.0003861382 0.0014 1.765291E-006 0 0.0016519986 0 0 0.0052816797 8.0653426E-005 0.0011363374 0.0063646756 0 0.0053118 0.0032703303 0.0015043531 0.0028023351 0.0003957761 +T>G ATC A[T>G]C 0.0017840913 2.4844826E-005 0.0014642311 0.0002 0.0022465927 0.0017 0 0.0007936713 0.0051 0.004 0 0.001 0.0001123832 0.002 0.0006 0.0042 0.0064686613 0.0012549821 0 0.0003813879 0.0031 0 0 0.0014639507 0 0.0001631712 0.0009589764 0.0166109056 0 0.0024592 0.0055761328 3.70860800590008E-020 0.0078161802 0.0016649968 +T>G ATG A[T>G]G 0.0013858306 0.0002734541 0.0090537776 0.0015 0.0054902168 0.0007 0.0009 0.0068658004 0.0052 0.0015 0.0001 0.0018 3.6000696E-005 0.0007 0.0012 0.0061 0.003481549 0.0023692161 0.001 0.0006587581 0.0016 0.0010868728 0 0.0008476125 0.0071364216 0.0012550795 6.766061E-005 0.0065928422 0.0029251714 0.0062955 0.0201754566 0.0174806227 0.0270345909 0.0016119211 +T>G ATT A[T>G]T 0.0031585393 0.0002176949 0.0070309764 0.0002 0.0038212267 0.0029 0 0.0029399223 0.0193 0.0137 0 0.003 0.0001924938 0.0056 0.0029 0.0076 0.0309263752 0.0003386029 0.0017 0.0005321252 0.0018 0 0 0.0010754553 0 0.0018503835 0.0002560277 0.0863928974 0 0.0095416 0.0054223733 0.0017155767 0.0053889069 0.0008560725 +T>G CTA C[T>G]A 0.0003026912 0.0001137846 0.0019741811 0 0.0020416489 0.0001 0 0.0024240506 0.0216 0.0022 0 0.0013 1.5094208E-005 0 0 0.0002 9.1469787E-005 6.1002548E-005 0 2.3264084E-005 0 0.0011203565 0 0 0 9.539626E-005 0 0.0095898419 0.0011168548 0.0046233 3.27595444415171E-020 3.70860800590008E-020 0.0005262194 0.0004210208 +T>G CTC C[T>G]C 0.0020985024 2.2095087E-005 0.0058242955 0.0013 0.0034785021 0.004 0.0008 0.0020962106 0.0064 0.0018 0.0001 0.0049 0.0001476248 0.002 0.0019 0.0082 0.0198142936 0.0020156969 0.0011 0.0084764945 0.0005 0.0010645836 0.0001901443 0.0021854373 0.0028864543 0.0039194551 0.000381245 0.0339132237 0 0.0060004 0.0039803758 
0.0025376549 0.0060513622 2.28346914246428E-020 +T>G CTG C[T>G]G 0.0015995485 0.0002282459 0.0104646545 0.0046 0.0071474562 0.005 0.0009 0.0066040463 0.0126 0.0037 0.0011 0.0045 4.59873E-005 0.0005 0.0007 0.0067 0.0134498376 0.0053119673 0.0013 0.0153837761 0.0004 0.0041885681 0 0.0001139097 0.0117694925 0.0054973573 0.0004032572 0.0185164816 0.0021696933 0.0073775 0.0039200627 0.0028967736 0.007564699 0.0003827374 +T>G CTT C[T>G]T 0.0027585376 6.711134E-005 0.0087243873 0.0012 0.0114868115 0.0086 0.0013 0.0048667139 0.0509 0.0182 0.0009 0.0063 0.0004637147 0.0063 0.0045 0.0186 0.2614566141 0.0023416292 0.0019 0.0021853947 0.0032 0.0016294096 0.000210835 0.0025337183 0.0085508075 0.0067887187 0.0014390961 0.118967103 0.0011436633 0.0091481 0.0157841091 0.012870271 0.0262293556 0.0019670653 +T>G GTA G[T>G]A 0.000099045 9.5552392E-005 0.004144488 0 0.0016276645 0 0 0.0009745787 0.0072 0 0 0 1.8489857E-005 0 0 0 3.9193821E-005 0.0013415325 0 3.579149E-006 0 0 0 0 0.0055462269 3.7393232E-005 0 0 0 0.0032461 0.0015356557 0.0019915953 0.0015778632 0.0008546281 +T>G GTC G[T>G]C 0.0002023656 4.7002381E-005 0.0045019853 0 0.0003277349 0.0016 0 0.0005248216 0.0006 0.002 0 0.0004 9.3373513E-005 0.0019 0.0022 0.0032 0.0090784615 9.20431E-007 0 0.0028305751 0.0006 0 0 0.0010736323 0 0.002460705 5.0790565E-005 0 0 0.001869 0.0007606153 0.0041116159 0.0032012845 0.0008454968 +T>G GTG G[T>G]G 0.0011883532 0.0001099257 0.0163914526 0.0018 0.0059488798 0.001 0.0017 0.0060877535 0.005 0.0009 0.001 0.0011 1.0579194E-005 0.0012 0.0037 0.0008 0.0047827755 0.009695 0.0043 0.0027332194 0.0004 0.0018775892 0.0001871324 0.002924697 0.0086852444 0.0008172016 0.0047429186 0.0042178083 0.0036405938 0.0033445 0.0054079152 3.70860800590008E-020 0.002512037 0.0012687784 +T>G GTT G[T>G]T 0.0008007233 8.647718E-005 0.0070672366 0.0002 0.0033074666 0.0035 0.0009 0.0054274338 0.0185 0.003 0 0.0032 5.3741207E-005 0.0038 0.019 0.0044 0.0634977566 0.0028905535 0.0025 0.0035585586 0.0008 0 0 0.0008250983 
0.0027685717 0.0078335613 0.002298476 0.0316489973 0.0061810238 0.0056069 0.0101885638 0.0066566288 0.0079578565 0.0029254388 +T>G TTA T[T>G]A 0.0013975537 0.000071737 0.0054271842 0 0.0052028744 0.0009 0 0.0017432214 0.0502 0.005 0 0.0019 0.0005465818 0 0 0.0068 0.0001334106 3.6318404E-005 0.0032 2.69147E-007 0 0.0016516988 0 0 0.0020814799 8.534935E-006 0.0011890225 0.0093896587 0 0.0086563 0.0008187477 0.000948451 0.0005293442 2.28346914246428E-020 +T>G TTC T[T>G]C 0.001291737 1.4281456E-005 0.0061602504 0.0003 0.0051316079 0.0019 0.001 0.0025498383 0.0081 0.0092 0 0.0027 0.0002353471 0.0015 0.0004 0.0069 0.0096133452 0.003233838 0.0018 0.0003772344 0.0018 0 0 0 0.0005789788 0.0027185098 0.0002802954 0.030117077 0 0.0043282 0.0015336834 0.0010831046 0.0010211012 2.28346914246428E-020 +T>G TTG T[T>G]G 0.0020310769 0.0002066152 0.0110765263 0.003 0.0060552541 0.0011 0.001 0.0060303952 0.0088 0.0022 0.0003 0.0011 0.000000479 0.0002 0.0009 0.0049 0.0045224623 0.0007546018 0.0011 0.0005154216 0.0003 0.002572752 0.0002475019 0.0013605049 0.0094291959 0.0013691612 0.0023530556 0.0126987508 0.000353696 0.0082628 0.0015835907 3.70860800590008E-020 0.003717572 2.28346914246428E-020 +T>G TTT T[T>G]T 0.0040301282 2.3598204E-005 0.0130009842 0.0011 0.0133699358 0.0072 0.0014 0.0072239989 0.0545 0.0633 0.0003 0.0032 0.0006705883 0.0025 0.0033 0.0163 0.0580404078 0.0021264415 0.0013 0.0006156567 0.0003 0 0 6.9515778E-005 0.0078696716 0.0025680767 0.0001395613 0.2336597833 0.0061048341 0 0.0031734006 0.001871395 0.0032018465 2.28346914246428E-020
#!/usr/bin/Rscript

#-----------------------------------#
# Author: Maude                     #
# Script: compareSignature_Galaxy.r #
# Last update: 29/10/15             #
#-----------------------------------#


################################################################################
# Compare new signatures with published ones using the cosine similarity.      #
#                                                                              #
# Inputs (positional command-line arguments):                                  #
#   1. tab-separated matrix of published signatures                            #
#   2. tab-separated matrix of the signatures to compare                       #
#   3. output folder                                                           #
# Outputs written to the output folder:                                        #
#   Similarity_Matrix.txt  cosine similarity, unknown (rows) x published (cols)#
#   Similarity_Matrix.png  heatmap of the same matrix                          #
################################################################################


#-------------------------------------------------------------------------------
# Print a usage message if the three expected arguments are not all present
#-------------------------------------------------------------------------------
args <- commandArgs(TRUE)
usage <- function()
{
  msg <- paste0('Usage:\n',
                ' compareSignature_Galaxy.r Published_Signature New_Signature Output_Folder\n'
  )
  # stderr() instead of the "/dev/stderr" path so the message is not tied to a
  # platform that exposes that device file
  cat(msg, '\n', file=stderr())
  quit(status=1)
}

# All three positional arguments are read below, so fewer than three can never
# succeed: fail early with the usage text instead of an obscure read error.
if (length(args) < 3) { usage() }


#-------------------------------------------------------------------------------
# Load library
#-------------------------------------------------------------------------------
suppressMessages(suppressWarnings(library(lsa)))      # cosine()
suppressMessages(suppressWarnings(library(ggplot2)))
suppressMessages(suppressWarnings(library(reshape)))  # melt()

#-------------------------------------------------------------------------------
# Recover the arguments
#-------------------------------------------------------------------------------
published_signature_file <- args[1] # The matrix with the published signatures
unknown_signature_file   <- args[2] # The matrix W from NMF from which we want to compare the signatures
dir                      <- args[3] # html directory


#-------------------------------------------------------------------------------
# Set the variables
#-------------------------------------------------------------------------------
# Create the outputs
output_cosineRes <- paste0(dir, "/Similarity_Matrix.txt")
output_png       <- paste0(dir, "/Similarity_Matrix.png")


#-------------------------------------------------------------------------------
# Calculate the cosine similarity and represent it with a heatmap
#-------------------------------------------------------------------------------
# Published signatures
dataFrame1 <- read.table(published_signature_file, header=TRUE, sep="\t")
# Remove the first three (annotation) columns and keep the signature columns.
# drop=FALSE keeps a data frame, with its column name, even if one column is left.
dataFrame1 <- dataFrame1[, 4:ncol(dataFrame1), drop=FALSE]
matrix1    <- as.matrix(dataFrame1)

# Unknown signatures
dataFrame2 <- read.table(unknown_signature_file, header=TRUE, sep="\t")
# Remove the first two columns (alteration, context)
dataFrame2 <- dataFrame2[, 3:ncol(dataFrame2), drop=FALSE]
matrix2    <- as.matrix(dataFrame2)

# Combine the two matrices (published and unknown signatures)
input_matrix_cos <- cbind(matrix1, matrix2)
# Calculate the cosine similarity between every pair of columns
cosine_res <- cosine(input_matrix_cos)

# Keep only the comparison between the two matrices:
# rows = unknown signatures, columns = published signatures
firstUnknown      <- ncol(matrix1) + 1 # index of the first unknown signature
# drop=FALSE: with a single unknown signature the result must stay a matrix,
# otherwise melt() below no longer yields the three expected columns
cosine_res_subset <- cosine_res[firstUnknown:nrow(cosine_res), 1:ncol(matrix1), drop=FALSE]

# Save the matrix
write.table(cosine_res_subset, file=output_cosineRes, quote=FALSE, sep="\t", col.names=TRUE, row.names=TRUE)

# Transform the matrix in a suitable format for ggplot2
cosineRes_subset_melt <- melt(cosine_res_subset)
# Rename the columns
colnames(cosineRes_subset_melt) <- c("Unknown_Signatures", "Published_Signatures", "Similarity")
# Reorder the signatures so that the plot follows the order of the matrix:
# turn the column into a character vector ...
cosineRes_subset_melt$Published_Signatures <- as.character(cosineRes_subset_melt$Published_Signatures)
# ... then back into a factor whose levels are in reversed order of appearance
cosineRes_subset_melt$Published_Signatures <- factor(cosineRes_subset_melt$Published_Signatures,
                                                     levels=rev(unique(cosineRes_subset_melt$Published_Signatures)))

# Base plot: heatmap
p1 <- ggplot(cosineRes_subset_melt, aes(x=Published_Signatures, y=Unknown_Signatures, fill=Similarity)) +
  geom_tile(colour="yellow") +
  scale_fill_gradientn(colours=c("yellow", "red")) +
  theme_classic()

# Rename the signatures (only for the reference file shipped with the tool)
if(basename(published_signature_file) == "Frequency-COSMICv72-Hupki.txt")
{
  # Column names as produced by read.table from the headers of the reference file
  signatureBreaks <- c(paste0("Signature.", 1:30),
                       paste0("Signature.", c(1, 2, 3, 5), ".MEF"))
  signatureLabels <- c("(Age) Sign 1", "(AID/APOBEC) Sign 2", "(BRCA1/2) Sign 3", "(Smoking) Sign 4", "Sign 5", "(DNA MMR deficiency) Sign 6", "(UV) Sign 7",
                       "Sign 8", "(IgG) Sign 9", "(pol e) Sign 10", "(temozolomide) Sign 11", "Sign 12", "(AID/APOBEC) Sign 13", "Sign 14",
                       "(DNA MMR deficiency) Sign 15", "Sign 16", "Sign 17", "Sign 18", "Sign 19", "(DNA MMR deficiency) Sign 20", "Sign 21", "(AA) Sign 22",
                       "Sign 23", "(Aflatoxin) Sign 24", "Sign 25", "(DNA MMR deficiency) Sign 26", "Sign 27", "Sign 28", "(Tobacco chewing) Sign 29", "Sign 30",
                       "(AA) Sign 1 MEF", "(AID) Sign 2 MEF", "(BaP) Sign 3 MEF", "(MNNG) Sign 5 MEF")
  p1 <- p1 + scale_x_discrete(breaks = signatureBreaks, labels = signatureLabels)
}

# Flip the cartesian coordinates so that horizontal becomes vertical, and vertical, horizontal
p1 <- p1 + coord_flip()
# Remove the axis lines
p1 <- p1 + theme(axis.line.x=element_blank(), axis.line.y=element_blank())
# Add the cosine value only if >= 0.9 (after rounding to two digits)
cosResLabel <- subset(cosineRes_subset_melt, round(Similarity, digits=2) >= 0.9)
# Skip the text layer entirely when nothing reaches the threshold
if(nrow(cosResLabel) > 0)
{
  # Columns are referenced by name: the layer data is cosResLabel
  p1 <- p1 + geom_text(data = cosResLabel, aes(x = Published_Signatures, y = Unknown_Signatures, label = round(Similarity, 2)))
}

graphics.off()
options(bitmapType='cairo')
png(output_png, width=3000, height=2000, res=300)
plot(p1)
invisible( dev.off() )
#!/usr/bin/Rscript

#-----------------------------------#
# Author: Maude                     #
# Script: estimateSign_Galaxy.r     #
# Last update: 22/07/15             #
#-----------------------------------#

################################################################################
# Estimate the number of signatures for NMF.                                   #
#                                                                              #
# Runs NMF for every rank from 2 to --stop on the input matrix and on a        #
# shuffled copy of it, then plots both estimations into the --output PNG.      #
################################################################################

#-------------------------------------------------------------------------------
# Load library for recovering the arguments
#-------------------------------------------------------------------------------
# library() rather than require(): a missing package must stop the script here
# instead of failing later on the call to getopt()
suppressMessages(suppressWarnings(library("getopt")))


#-------------------------------------------------------------------------------
# Recover the arguments
#-------------------------------------------------------------------------------
spec = matrix(c(
                "input" , "i",    1, "character",
                "stop",   "stop", 1, "numeric",
                "cpu",    "cpu",  1, "integer",
                "output", "o",    1, "character",
                "help",   "h",    0, "logical"
               ),
               byrow=TRUE, ncol=4
             )

opt = getopt(spec)

# Usage text shared by every exit path below
usageMsg <- "Usage:\n estimateSign_Galaxy.r --input <matrix> --stop <maxNbSign> --cpu <cpu> --output <output_filename.png>\n"

# No argument is passed to the command line
if(length(opt) == 1)
{
  cat(usageMsg)
  q(status=1)
}

# Help was asked for.
if ( !is.null(opt$help) )
{
  # print a friendly message and exit with a non-zero error code
  cat(usageMsg)
  q(status=1)
}

# The input matrix, the maximum rank and the output file are all used below
if( is.null(opt$input) || is.null(opt$stop) || is.null(opt$output) )
{
  cat(usageMsg)
  q(status=1)
}

# The rank sequence starts at 2: a smaller --stop would make 2:opt$stop count
# downwards (2, 1, ...) instead of being empty
if(opt$stop < 2)
{
  write("\n\nERROR: --stop must be greater than or equal to 2\n\n", stderr())
  q(status=1)
}



#-------------------------------------------------------------------------------
# Load library
#-------------------------------------------------------------------------------
suppressMessages(suppressWarnings(library(NMF)))


        ###############################################################################
        #                           Load the functions                                #
        ###############################################################################

#-------------------------------------------------------------------------------
# Check the file doesn't have lines equal to zero
#   rowsum    : sum of the values of the row being checked
#   dataFrame : the input matrix (used to report the name of the row)
#   x         : index of the row being checked
# Stops the script, after writing an error on stderr, when the row sum is zero.
#-------------------------------------------------------------------------------
CheckFile <- function(rowsum, dataFrame, x)
{
  if(rowsum == 0)
  {
    write("\n\nERROR: There is not enough mutations for running NMF!!!", stderr())
    write(paste0("Input matrix contains at least one null row ", rownames(dataFrame)[x], "\n\n"), stderr())
    stop()
  }
}


        ###############################################################################
        #                               Check file                                    #
        ###############################################################################

# The input mustn't contain lines equal to zero !!!
matrixNMF <- read.table(opt$input, header=TRUE)
# Compute the row sums once instead of once per row
rowTotals <- rowSums(matrixNMF)
# seq_along() yields nothing for an empty table, unlike 1:nrow()
for(i in seq_along(rowTotals)) { CheckFile(rowTotals[i], matrixNMF, i) }



        ###############################################################################
        #                     Estimate the number of signatures                       #
        ###############################################################################
# Estimate the number of signatures with our data
nbCPU  <- paste0("vP", opt$cpu)  # NMF option string, e.g. "vP4"
nbSign <- 2:opt$stop             # The minimum number of signatures can't be lower than 2

estim_r <- nmf(matrixNMF, method="brunet", nbSign, nrun=50, .opt=nbCPU)

# Shuffle original data
v_random <- randomize(matrixNMF)
# Estimate quality measures from the shuffled data
estim_r_random <- nmf(v_random, method="brunet", nbSign, nrun=50, .opt=nbCPU)

# Plot the estimation for our data and the random ones
graphics.off()
options(bitmapType='cairo')
png(opt$output, width=3000, height=2000, res=300)
plot(estim_r, estim_r_random)
invisible( dev.off() )
#!/usr/bin/Rscript

#-----------------------------------#
# Author: Maude                     #
# Script: mutationSpectra_Galaxy.r  #
# Last update: 23/07/15             #
#-----------------------------------#

################################################################################
# Represent the mutation spectra with a bar graph.                             #
#                                                                              #
# Two PNG files are written: a high resolution one (HTML page) and a low       #
# resolution one (Excel report). Both show the same plot, with the number of   #
# mutations of each type written above the corresponding facet.                #
################################################################################

#-------------------------------------------------------------------------------
# Print a usage message if the mandatory arguments are missing
#-------------------------------------------------------------------------------
args <- commandArgs(TRUE)
usage <- function()
{
  # Count_ct is listed here because args[7] is read as the C>T count below
  msg <- paste0('Usage:\n',
                ' mutationSpectra_Galaxy.r input_Mutation_Spectra Sample_Name Output_Folder_High_Resolution Output_Folder_Low_Resolution Count_ca Count_cg Count_ct Count_ta Count_tc Count_tg\n',
                '\ninput_Mutation_Spectra should be tab-separated: alteration context value\n',
                '\nOutput_Folder_High_Resolution: Folder for saving the high resolution image (display on the HTML page)\n',
                '\nOutput_Folder_Low_Resolution: Folder for saving the low resolution image (display on the Excel report)\n'
  )
  # stderr() instead of the "/dev/stderr" path so the message is not tied to a
  # platform that exposes that device file
  cat(msg, '\n', file=stderr())
  quit(status=1)
}

# The input file, the sample name and the two output folders are required to
# produce both images; the counts only feed the labels above the facets.
if (length(args) < 4) { usage() }


#-------------------------------------------------------------------------------
# Load library
#-------------------------------------------------------------------------------
suppressMessages(suppressWarnings(library(ggplot2)))
suppressMessages(suppressWarnings(library(reshape)))
suppressMessages(suppressWarnings(library(grid)))
suppressMessages(suppressWarnings(library(scales)))
suppressMessages(suppressWarnings(library(gridExtra)))



#-------------------------------------------------------------------------------
# Recover the arguments
#-------------------------------------------------------------------------------
input         <- args[1]
sampleName    <- args[2]
output_html   <- args[3]
output_report <- args[4]
count_ca      <- as.numeric(args[5])
count_cg      <- as.numeric(args[6])
count_ct      <- as.numeric(args[7])
count_ta      <- as.numeric(args[8])
count_tc      <- as.numeric(args[9])
count_tg      <- as.numeric(args[10])

# Labels written above each facet: mutation type and number of mutations
count_ca <- paste("C>A (", count_ca, ")")
count_cg <- paste("C>G (", count_cg, ")")
count_ct <- paste("C>T (", count_ct, ")")
count_ta <- paste("T>A (", count_ta, ")")
count_tc <- paste("T>C (", count_tc, ")")
count_tg <- paste("T>G (", count_tg, ")")

# One colour per mutation type, like Alexandrov et al.
# (used for the bars and for the facet strips)
mutationColours <- c("blue", "black", "red", "#828282", "#00CC33", "pink")



        ###############################################################################
        #                           Load the functions                                #
        ###############################################################################

#-------------------------------------------------------------------------------
# Set the font depending on X11 availability
#-------------------------------------------------------------------------------
# Query the capability by name rather than by its position in the vector;
# [[1]] drops the name and isTRUE() treats an undetermined (NA) value as "no"
if(isTRUE(capabilities("X11")[[1]])) { font <- "Helvetica" } else { font <- "mono" }

#-------------------------------------------------------------------------------
# My own theme: theme_grey without background, border and grid, black axes
#-------------------------------------------------------------------------------
theme_custom <- function(base_size = 12, base_family = "")
{
  # Starts with theme_grey and then modify some parts
  theme_grey(base_size = base_size, base_family = base_family) %+replace%
  theme(
        axis.text        = element_text(size = rel(1), family=font),
        axis.ticks       = element_line(colour = "black"),
        axis.line        = element_line(colour = "black", size = .5),
        legend.key       = element_blank(),
        panel.background = element_blank(),
        panel.border     = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()
       )
}


#-------------------------------------------------------------------------------
# Customize the theme for adding a y axis
#-------------------------------------------------------------------------------
mytheme <- theme_custom()
mytheme$axis.line.x <- mytheme$axis.line.y <- mytheme$axis.line
# The x axis line is drawn in white, leaving only the y axis visible
mytheme$axis.line.x$colour <- 'white'

#-------------------------------------------------------------------------------
# Set the decimal precision to 0.0
# Returns a labelling function usable by scale_y_continuous(labels=...)
#-------------------------------------------------------------------------------
fmt <- function()
{
  function(x) format(x,nsmall = 1,scientific = FALSE, digits=1)
}

#-------------------------------------------------------------------------------
# Add the label of each mutation type above the strip facet of the current plot
#-------------------------------------------------------------------------------
addCountLabels <- function()
{
  xPos        <- c(0.13, 0.29, 0.45, 0.6, 0.76, 0.92)
  countLabels <- c(count_ca, count_cg, count_ct, count_ta, count_tc, count_tg)
  for(i in seq_along(xPos))
  {
    grid.text(countLabels[i], x=xPos[i], y=unit(0.90,"npc") - unit(1,"line"))
  }
}



        ###############################################################################
        #                                   MAIN                                      #
        ###############################################################################

matrixW_inputggplot2 <- read.table(input, header=TRUE)
matrixW_melt         <- melt(matrixW_inputggplot2)
# Maximum over the value columns (everything after alteration and context)
max_matrixW          <- max(matrixW_inputggplot2[,3:ncol(matrixW_inputggplot2)])


p <- ggplot(matrixW_melt, aes(x=context, y=value, fill=alteration)) + geom_bar(stat="identity", width=0.5) + facet_grid(variable ~ alteration, scales="free_y")
# Color the mutation like Alexandrov et al.
p <- p + scale_fill_manual(values=mutationColours)
# Remove the legend
p <- p + guides(fill=FALSE)
# customized theme (no background, no facet grid and strip, y axis only)
p <- p + mytheme
# Remove the title of the x facet strip
p <- p + theme(strip.text.x=element_blank(), strip.text.y=element_blank())
# Remove the x axis label, ticks and title
p <- p + theme(axis.title.x=element_blank(), axis.ticks.x=element_blank(), axis.title.y=element_text(size=15))
# Scale the y axis to the maximum value
p <- p + scale_y_continuous(limits=c(0,max_matrixW), oob=squish, breaks=c(0,max_matrixW), labels=fmt())
# Rename the y axis
p <- p + ylab("percent")
# Add a title to the plot
p <- p + ggtitle(sampleName) + theme(plot.title = element_text(vjust = 3.4, family=font))
# Add a top margin for writing the title of the plot
p <- p + theme(plot.margin=unit(c(.7,0,0,0), "cm"))
# Show the 5' base once per group of four contexts and the 3' base under each bar
p <- p + scale_x_discrete(breaks = c("A_A","A_C","A_G","A_T", "C_A","C_C","C_G","C_T", "G_A","G_C","G_G","G_T", "T_A","T_C","T_G","T_T"),
                          labels = c('A\nA',"\nC","\nG","\nT", 'C\nA',"\nC","\nG","\nT",
                                     'G\nA',"\nC","\nG","\nT", 'T\nA',"\nC","\nG","\nT"
                                    )
                         )

#-------------------------------------------------------------------------------
# Change the color of the facets for the mutation type
#-------------------------------------------------------------------------------
cols <- mutationColours # Facet strip colours

# Make a grob object
Pg <- ggplotGrob(p)
# To keep track of strip.background grobs
idx <- 0
# Find each strip.background and alter its background colour
for( g in seq_along(Pg$grobs) )
{
  if( grepl( "strip.absoluteGrob" , Pg$grobs[[g]]$name ) )
  {
    idx <- idx + 1
    sb  <- which( grepl( "strip\\.background" , names( Pg$grobs[[g]]$children ) ) )
    Pg$grobs[[g]]$children[[sb]][]$gp$fill <- cols[idx]
  }
}

# Reduce the size of the facet strip
Pg$heights[[3]] = unit(.1,"cm")


# Save the plot for the HTML page (higher resolution)
graphics.off() # close graphics windows
# Use cairo device as isn't possible to install X11 on the server...
png(paste0(output_html, "/", sampleName, "-MutationSpectraPercent-Genomic.png"), width=3500, height=500, res=300, type=c("cairo-png"))
plot(Pg)
## Add label for the mutation type above the strip facet
addCountLabels()
invisible( dev.off() )

# Save the plot for the report
png(paste0(output_report, "/", sampleName, "-MutationSpectraPercent-Genomic-Report.png"), width=1000, height=150, type=c("cairo-png"))
plot(Pg)
## Add label for the mutation type above the strip facet
addCountLabels()
invisible( dev.off() )

# Delete the empty plot created by the script
if (file.exists("Rplots.pdf")) invisible( file.remove("Rplots.pdf") )
#!/usr/bin/Rscript

#-----------------------------------#
# Author: Maude                     #
# Script: somaticSignature_Galaxy.r #
# Last update: 29/07/15             #
#-----------------------------------#


################################################################################
# Run NMF algorithm and represent the composition of somatic signatures and    #
# the contribution in each samples                                             #
################################################################################

#-------------------------------------------------------------------------------
# Load library for recovering the arguments
#-------------------------------------------------------------------------------
# library() rather than require(): a missing package must stop the script here
# instead of failing later on the call to getopt()
suppressMessages(suppressWarnings(library("getopt")))


#-------------------------------------------------------------------------------
# Recover the arguments
#-------------------------------------------------------------------------------
spec = matrix(c(
                "input" ,      "i",      1, "character",
                "nbSignature", "nbSign", 1, "integer",
                "cpu",         "cpu",    1, "integer",
                "output",      "o",      1, "character",
                "help",        "h",      0, "logical"
               ),
               byrow=TRUE, ncol=4
             )

opt = getopt(spec)

# Usage text shared by every exit path below
usageMsg <- "Usage:\n somaticSignature_Galaxy.r --input <matrix> --nbSignature <nbSign> --cpu <cpu> --output <outputdir>\n"

# No argument is passed to the command line
if(length(opt) == 1)
{
  cat(usageMsg)
  q(status=1)
}

# Help was asked for.
if ( !is.null(opt$help) )
{
  # print a friendly message and exit with a non-zero error code
  cat(usageMsg)
  q(status=1)
}

# The input matrix, the number of signatures and the output folder are all used below
if( is.null(opt$input) || is.null(opt$nbSignature) || is.null(opt$output) )
{
  cat(usageMsg)
  q(status=1)
}



#-------------------------------------------------------------------------------
# Load library
#-------------------------------------------------------------------------------
suppressMessages(suppressWarnings(library(NMF)))
suppressMessages(suppressWarnings(library(ggplot2)))
suppressMessages(suppressWarnings(library(reshape)))
suppressMessages(suppressWarnings(library(grid)))
suppressMessages(suppressWarnings(library(scales)))    # Set the maximum value to the y axis (graph composition somatic signature)
suppressMessages(suppressWarnings(library(gridExtra))) # function "unit"



        ###############################################################################
        #                           Load the functions                                #
        ###############################################################################

#-------------------------------------------------------------------------------
# Set the font depending on X11 availability
#-------------------------------------------------------------------------------
font <- ""
# Check the device available
device <- capabilities()
# Look the capability up by name rather than by its position in the vector;
# [[ ]] drops the name and isTRUE() treats an undetermined (NA) value as "no"
if(isTRUE(device[["X11"]])) { font <- "Helvetica" } else { font <- "Helvetica-Narrow" }

#-------------------------------------------------------------------------------
# My own theme: theme_grey without background, border and grid, thin black axes
#-------------------------------------------------------------------------------
# NOTE(review): axis.ticks.margin and a unit-valued legend.margin look like
# theme elements of older ggplot2 releases -- confirm against the installed version.
theme_custom <- function(base_size = 4, base_family = "")
{
  # Starts with theme_grey and then modify some parts
  theme_grey(base_size = base_size, base_family = base_family) %+replace%
  theme(
        axis.text         = element_text(size = rel(0.8), family=font),
        axis.ticks        = element_line(colour = "black", size=.2),
        axis.line         = element_line(colour = "black", size = .2),
        axis.ticks.length = unit(.05, "cm"),
        axis.ticks.margin = unit(.05, "cm"), # space between tick mark and tick label (unit)
        legend.key.size   = unit(.2, "cm"),
        legend.margin     = unit(-.5, "cm"),
        panel.background  = element_blank(),
        panel.border      = element_blank(),
        panel.grid.major  = element_blank(),
        panel.grid.minor  = element_blank(),
        strip.text.y      = element_text(size = 3)
       )
}

#-------------------------------------------------------------------------------
# Customize the theme for adding a y axis
#-------------------------------------------------------------------------------
mytheme <- theme_custom()
mytheme$axis.line.x <- mytheme$axis.line.y <- mytheme$axis.line
# The x axis line is drawn in white, leaving only the y axis visible
mytheme$axis.line.x$colour <- 'white'

#-------------------------------------------------------------------------------
# Replace the signature number by alphabet letter
#   c : a name such as "row1".."row26" or "col1".."col26"
# Returns (invisibly) the matching letter "A".."Z", or c unchanged when the
# name is not one of those.
#-------------------------------------------------------------------------------
ConvertNb2Aphabet <- function(c)
{
  # The first 26 keys are the "row" names, the next 26 the "col" names
  knownNames <- c(paste0("row", 1:26), paste0("col", 1:26))
  pos        <- match(c, knownNames)
  if(!is.na(pos)) { c <- LETTERS[(pos - 1) %% 26 + 1] }
  invisible(c)
}

#-------------------------------------------------------------------------------
# Check the file doesn't have lines equal to zero
#   rowsum    : sum of the values of the row being checked
#   dataFrame : the input matrix (used to report the name of the row)
#   x         : index of the row being checked
# Stops the script, after writing an error on stderr, when the row sum is zero.
#-------------------------------------------------------------------------------
CheckFile <- function(rowsum, dataFrame, x)
{
  if(rowsum == 0)
  {
    write("\n\nERROR: There is not enough mutations for running NMF!!!", stderr())
    write(paste0("Input matrix contains at least one null row ", rownames(dataFrame)[x], "\n\n"), stderr())
    stop()
  }
}

#-------------------------------------------------------------------------------
# Contribution to Signature as the number of SBS per sample
#   Total_SBS : total number of SBS
#   Percent   : contribution expressed in percent
#-------------------------------------------------------------------------------
Contri2SignSBS <- function(Total_SBS, Percent)
{
  Total_SBS*Percent/100
}

#-------------------------------------------------------------------------------
# Combine several plots and share the legend
# The legend of the first plot is extracted, every plot is drawn without its
# own legend and the extracted legend is placed underneath.
#-------------------------------------------------------------------------------
grid_arrange_shared_legend <- function(...)
{
  plots   <- list(...)
  g       <- ggplotGrob(plots[[1]] + theme(legend.position="bottom"))$grobs
  legend  <- g[[which(sapply(g, function(x) x$name) == "guide-box")]]
  lheight <- sum(legend$height)
  grid.arrange(
    do.call(arrangeGrob, lapply(plots, function(x)
      x + theme(legend.position="none"))),
    legend,
    ncol = 1,
    heights = unit.c(unit(1, "npc") - lheight, lheight))
}

#-------------------------------------------------------------------------------
# Calculate the mean of each signature in each cluster
#   df : table whose columns 2 .. nbSignature+1 hold the signature values
# Returns one mean, rounded to two digits, per signature.
# Reads the number of signatures from the global option list (opt$nbSignature).
#-------------------------------------------------------------------------------
meanCluster <- function(df)
{
  lastColumn <- opt$nbSignature+1
  sapply(2:lastColumn, function(x) { round(mean(as.numeric(as.matrix(df[,x]))), 2) } )
}





        ###############################################################################
        #                               Check file                                    #
        ###############################################################################

# The input mustn't contain lines equal to zero !!!
matrixNMF <- read.table(opt$input, header=TRUE)
# Compute the row sums once instead of once per row
rowTotals <- rowSums(matrixNMF)
# seq_along() yields nothing for an empty table, unlike 1:nrow()
for(i in seq_along(rowTotals)) { CheckFile(rowTotals[i], matrixNMF, i) }



        ###############################################################################
        #                                 Run NMF                                     #
        ###############################################################################

# Create the output directories
# recursive=TRUE also creates the parent folder when it is missing, and
# showWarnings=FALSE keeps a re-run on an existing folder from writing on stderr
output_NMF     <- paste0(opt$output, "/NMF")
dir.create(output_NMF, showWarnings=FALSE, recursive=TRUE)
output_Figures <- paste0(output_NMF, "/", "Figures")
dir.create(output_Figures, showWarnings=FALSE, recursive=TRUE)
output_Files   <- paste0(output_NMF, "/", "Files")
dir.create(output_Files, showWarnings=FALSE, recursive=TRUE)

# Define the output filenames
output_cluster         <- paste0(output_Files, "/", "Cluster_MixtureCoeff.txt")
figure_cluster         <- paste0(output_Figures, "/", "Heatmap_MixtureCoeff.png")
output_matrixW         <- paste0(output_Files, "/", "MatrixW-Normto100.txt")
output_matrixW_ggplot2 <- paste0(output_Files, "/", "MatrixW-Inputggplot2.txt")
output_matrixH_ggplot2 <- paste0(output_Files, "/", "MatrixH-Inputggplot2.txt")
output_matrixH_cluster <- paste0(output_Files, "/", "Average_ContriByCluster.txt")
figure_matrixW_png     <- paste0(output_Figures, "/", "CompositionSomaticMutation.png")
figure_matrixH_png     <- paste0(output_Figures, "/", "ContributionMutationSignature.png")
figure_matrixH_cluster <- paste0(output_Figures, "/", "Average_ContriByCluster.png")


# Run NMF
# request a certain number of cores to use .opt="vP4"
nbCPU <- paste0("vP", opt$cpu)
res   <- nmf(matrixNMF, opt$nbSignature, "brunet", nrun=200, .opt=nbCPU)

# If there is more than 300 samples the creation of the heatmap returns an error
if(ncol(matrixNMF) <= 300)
{
  # Save the clustered heatmap generated by NMF
  graphics.off() # close graphics windows
  options(bitmapType='cairo')
  png(figure_cluster)
  coefmap(res, Colv="consensus")
  # invisible(): do not print the device number on stdout
  invisible( dev.off() )
}

# Recover the matrix W and H
matrixW <- basis(res)
matrixH <- coef(res)

# Recover the cluster of the samples
matrix_cluster <- cbind(as.numeric(predict(res, what="samples")), colnames(matrixNMF))
colnames(matrix_cluster) <- c("Cluster", "Samples")

## Save the cluster matrix
write.table(matrix_cluster, file=output_cluster, quote=FALSE, sep="\t", col.names=TRUE, row.names=FALSE)



        ###############################################################################
        #                     Composition of somatic signatures                       #
        ###############################################################################

# Normalize to 100% (each column of W sums to 100)
matrixW_norm <- t((t(matrixW)/colSums(matrixW))*100)
# Add a column name ("col1", "col2", ...) when there is none
colnames(matrixW_norm) <- colnames(matrixW_norm, do.NULL = FALSE, prefix = "col")
# Replace the name of the columns by the signature name
colnames(matrixW_norm) <- sapply(1:length(colnames(matrixW_norm)), function(x) { ConvertNb2Aphabet(colnames(matrixW_norm)[x]) } )

# Split the sequence context from the mutation type
context    <- c() # Create an empty vector for the sequence context
alteration <- c() # Create an empty vector for the mutation type
+for(i in 1:nrow(matrixW_norm)) +{ + temp <- strsplit((strsplit(rownames(matrixW_norm)[i], ""))[[1]], "") + + context[i] <- paste0(temp[1], "_", temp[7]) + alteration[i] <- paste0(temp[3], temp[4], temp[5]) +} + +# Melt the matrix using the signatures as variable +matrixW_melt <- melt(matrixW_norm) + +# Add columns for the mutation type and the sequence context +matrixW_norm <- cbind(matrixW_norm, alteration, context) +# Reorder (alteration) for having the same order as in the matrice of published signatures +matrixW_norm <- matrixW_norm[order(matrixW_norm[,"alteration"], matrixW_norm[,"context"]), ] +# Reorder (columns) for having the same order as in the matrice of published signatures +matrixW_norm <- cbind(matrixW_norm[,c("alteration", "context")], matrixW_norm[,1:(ncol(matrixW_norm)-2)]) # Put the column alteration and context at the begining +# Save the matrix +write.table(matrixW_norm, file=output_matrixW, quote=F, sep="\t", col.names=T, row.names=F) + +# Add columns for the mutation type and the sequence context +matrixW_melt <- cbind(matrixW_melt, alteration) +matrixW_melt <- cbind(matrixW_melt, context) +# Rename the columns +colnames(matrixW_melt) <- c("", "Signature", "value", "alteration", "context") + +# Save the input for ggplot2 +input_ggplot2 <- as.matrix(matrixW_melt) +input_ggplot2 <- input_ggplot2[,2:ncol(input_ggplot2)] +write.table(input_ggplot2, file=output_matrixW_ggplot2, quote=F, sep="\t", col.names=T, row.names=F) + +# Maximum value of the y axis +max_matrixW <- as.numeric(max(matrixW_melt$value)) + + +# Base plot +p <- ggplot(matrixW_melt, aes(x=context, y=value, fill=alteration)) + geom_bar(stat="identity", width=0.5) + facet_grid(Signature ~ alteration, scales="free_y") +# Color the mutation types +p <- p + scale_fill_manual(values=c("blue", "black", "red", "#828282", "#00CC33", "pink")) +# Remove the legend +p <- p + guides(fill=FALSE) +# Customized theme (no background, no facet grid and strip, y axis only) +p <- p + mytheme +# 
Remove the title of the x facet strip +p <- p + theme(strip.text.x=element_blank()) +# Remove the x axis ticks and title +p <- p + theme(axis.title.x=element_blank(), axis.ticks.x = element_blank(), axis.title.y=element_text(size=5)) +# Rename the y axis +p <- p + ylab("% contribution to signatures") +# Set the maximum value of the y axis to the maximum value of the matrix W +p <- p + scale_y_continuous(limits=c(0,max_matrixW), oob=squish, breaks=c(0,round(max_matrixW))) +# Save some space for adding the sequence context at the bottom +p <- p + theme(plot.margin=unit(c(.3, 0, 0, 0), "cm")) +p <- p + scale_x_discrete(breaks = c("A_A","A_C","A_G","A_T", "C_A","C_C","C_G","C_T", "G_A","G_C","G_G","G_T", "T_A","T_C","T_G","T_T"), + labels =c('A\nA',"\nC","\nG","\nT", 'C\nA',"\nC","\nG","\nT", + 'G\nA',"\nC","\nG","\nT", 'T\nA',"\nC","\nG","\nT") + ) + + +#------------------------------------------------------------------------------------------------------------------------------ +# Change the color of the facets for the mutation type +#------------------------------------------------------------------------------------------------------------------------------ +cols <- rep( c("blue", "black", "red", "#828282", "#00CC33", "pink")) # Facet strip colours + +# Make a grob object +Pg <- ggplotGrob(p) +# To keep track of strip.background grobs +idx <- 0 +# Find each strip.background and alter its backround colour +for( g in 1:length(Pg$grobs) ) +{ + if( grepl( "strip.absoluteGrob" , Pg$grobs[[g]]$name ) ) + { + idx <- idx + 1 + sb <- which( grepl( "strip\\.background" , names( Pg$grobs[[g]]$children ) ) ) + Pg$grobs[[g]]$children[[sb]][]$gp$fill <- cols[idx] + } +} + +# Reduce the size of the facet strip +Pg$heights[[3]] = unit(.05,"cm") + + +#------------------------------------------------------------------------------------------------------------------------------ +# Save the graph in a png file 
+#------------------------------------------------------------------------------------------------------------------------------ +options(bitmapType='cairo') +png(figure_matrixW_png, width=1300, heigh=500, res=300, pointsize = 4) +plot(Pg) +## Add label for the mutation type above the strip facet +grid.text(0.12, unit(1,"npc") - unit(1.4,"line"), label="C>A") +grid.text(0.27, unit(1,"npc") - unit(1.4,"line"), label="C>G") +grid.text(0.42, unit(1,"npc") - unit(1.4,"line"), label="C>T") +grid.text(0.58, unit(1,"npc") - unit(1.4,"line"), label="T>A") +grid.text(0.74, unit(1,"npc") - unit(1.4,"line"), label="T>C") +grid.text(0.89, unit(1,"npc") - unit(1.4,"line"), label="T>G") +invisible( dev.off() ) + + + + ############################################################################### + # Contribution of mutational signature in each samples # + ############################################################################### + +# Recover the total number of SBS per samples +NbSBS <- colSums(matrixNMF) +# Normalized matrix H to 100% +matrixH_norm <- t((t(matrixH)/colSums(matrixH))*100) +# Add a row name +rownames(matrixH_norm) <- rownames(matrixH_norm, do.NULL = FALSE, prefix = "row") +# Replace the signature number by letter +rownames(matrixH_norm) <- sapply(1:length(rownames(matrixH_norm)), function(x) { ConvertNb2Aphabet(rownames(matrixH_norm)[x]) } ) + +## Combined the contribution with the total number of SBS +matrixH_norm_melt <- melt(matrixH_norm) +matrixH_norm_melt <- cbind(matrixH_norm_melt, rep(NbSBS, each = opt$nbSignature)) +colnames(matrixH_norm_melt) <- c("Signature", "Sample", "Value", "Total_SBS") + +# Calculate the contribution in number of SBS +matrixH_norm_melt$ContriSBS <- sapply(1:nrow(matrixH_norm_melt), function(x) { Contri2SignSBS(matrixH_norm_melt$Total_SBS[x], matrixH_norm_melt$Value[x]) } ) + + +# Save the matrix +write.table(matrixH_norm_melt, file=output_matrixH_ggplot2, quote=F, sep="\t", col.names=T, row.names=F) + +# Base plot for the 
contribution of each samples according the count of mutations +p2 <- ggplot(matrixH_norm_melt, aes(x=reorder(Sample, -ContriSBS), y=ContriSBS, fill=Signature)) + geom_bar(stat="identity") + theme_classic() +# Remove the name of samples +p2 <- p2 + theme(axis.text.x = element_blank()) +# Reverse the y axis +p2 <- p2 + scale_y_reverse() +# Rename the y and x axis +p2 <- p2 + ylab("Number of mutations") + xlab("Samples") +# Remove the x axis line +p2 <- p2 + theme(axis.line.x=element_blank()) + +# Base plot for the contribution of each samples in percentages +p3 <- ggplot(matrixH_norm_melt, aes(x=reorder(Sample, -ContriSBS), y=Value, fill=Signature)) + geom_bar(stat="identity") + theme_classic() + theme(axis.text.x = element_blank()) + xlab("") + ylab("% of mutations") +# Remove the x axis line +p3 <- p3 + theme(axis.line.x=element_blank(), axis.ticks.x=element_blank()) + + +# Plot PNG +png(figure_matrixH_png, width=3000, heigh=2000, res=300) +# Combined the two plots for the contribution of the samples +suppressWarnings( grid_arrange_shared_legend(p3, p2) ) +invisible( dev.off() ) + + + ############################################################################### + # Average contributions of each signature in each cluster # + ############################################################################### + +matrixH_cluster <- cbind(matrix_cluster[,1], t(matrixH_norm)) +colnames(matrixH_cluster) <- c("Cluster", colnames(t(matrixH_norm))) + +df <- as.data.frame(matrixH_cluster) + +tmp_mat <- sapply(1:opt$nbSignature, function(x) { meanCluster(df[df[,1] == x,]) } ) +# Add a name for the row and the col +rownames(tmp_mat) <- sapply(1:opt$nbSignature, function(x) { paste0("Sig. 
", x) } ) +colnames(tmp_mat) <- sapply(1:opt$nbSignature, function(x) { paste0("Cluster ", x) } ) +tmp_mat <- t(tmp_mat) +# Recover the number of samples in each cluster +nbSampleByCluster <- sapply(1:opt$nbSignature, function(x) { as.numeric( strsplit( as.character(dim(df[df[,1] == x,])), " " ) ) } ) +# Combined the average contribution and the number of samples +tmp_mat <- cbind(tmp_mat, nbSampleByCluster[1,]) +# Add a name for the row and the col +colnames(tmp_mat)[opt$nbSignature+1] <- "Number of samples" +# Save the matrix +write.table(tmp_mat, file=output_matrixH_cluster, quote=F, sep="\t", col.names=T, row.names=T) + +## Create an image of the table with ggplot2 +# Dummy plot +p4 <- qplot(1:10, 1:10, geom = "blank") + + theme(panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + panel.border = element_rect(fill=NA,color="white", size=0.5, linetype="solid"), + axis.line = element_blank(), + axis.ticks = element_blank(), + panel.background = element_rect(fill="white"), + plot.background = element_rect(fill="white"), + legend.position = "none", + axis.text = element_blank(), + axis.title = element_blank() + ) +# Adding a table +p4 <- p4 + annotation_custom(grob = tableGrob(tmp_mat), + xmin = 4, xmax = 7, + ymin = 0, ymax = 10) + +# Save the table +png(figure_matrixH_cluster, width=2500, heigh=1000, res=300) +# Combined the two plots for the contribution of the samples +plot(p4) +invisible( dev.off() ) + + +# Delete the empty plot created by the script +if (file.exists("Rplots.pdf")) invisible( file.remove("Rplots.pdf") )
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/R/transciptionalStrandBias.r Tue Apr 19 03:07:11 2016 -0400 @@ -0,0 +1,144 @@
#!/usr/bin/Rscript

#---------------------------------------------#
# Author: Maude                               #
# Script: transcriptionalStrandBias_Galaxy.r  #
# Last update: 03/07/15                       #
#---------------------------------------------#

###############################################################################
#                        Transcriptional strand bias                          #
###############################################################################
# Draws a bar plot of the values of each mutation type in its sequence context,
# coloured by strand, and saves it twice:
#   <Output_Folder_High_Resolution>.png        (4000x1000 px, with mutation type labels)
#   <Output_Folder_Low_Resolution>-Report.png  (ggsave, width 18)

#-------------------------------------------------------------------------------
# Print a usage message if the arguments are missing from the command line
#-------------------------------------------------------------------------------
args <- commandArgs(TRUE)
usage <- function()
{
  msg <- paste0('Usage:\n',
                ' transcriptionalStrandBias_Galaxy.r input Output_Folder_High_Resolution Output_Folder_Low_Resolution Label_Y_axis\n',
                '\ninput should be tab-separated: MutationTypeContext Strand Value Sample\n',
                '\nOutput_Folder_High_Resolution: Folder for saving the high resolution image (display on the HTML page)\n',
                '\nOutput_Folder_Low_Resolution: Folder for saving the low resolution image (display on the Excel report)\n',
                '\nLabel_Y_axis: can be Count or Percent'
               )
  # stderr() instead of the "/dev/stderr" path, which only exists on some platforms
  cat(msg, '\n', file=stderr())
  quit(status=1)
}

# The four positional arguments are all read below: refuse to run with fewer,
# otherwise the outputs would silently be written to "NA.png" / "NA-Report.png"
if (length(args) < 4) { usage() }



#-------------------------------------------------------------------------------
# Load library
#-------------------------------------------------------------------------------
suppressMessages(suppressWarnings(library(ggplot2)))
suppressMessages(suppressWarnings(library(gridExtra)))
# grid.text() and gpar() come from grid, which is not guaranteed to be attached by gridExtra
suppressMessages(suppressWarnings(library(grid)))


#-------------------------------------------------------------------------------
# Recover the argument pass in the command line
#-------------------------------------------------------------------------------
input         <- args[1]
output        <- args[2]
output_temp   <- args[3] # Temp folder for the plot present in the Excel report
legend_y_axis <- args[4]


#-------------------------------------------------------------------------------
# Create the plot
#-------------------------------------------------------------------------------
## Load the data
txnSB <- read.table(input, header=T)
## Define the color for the transcribed (blue) and non-transcribed strand(red)
cb_palette_SB <- c("#0072B2", "#CC0000")

## Order of the mutations on the x axis (same order as NMF):
## 6 mutation types x 16 sequence contexts = 96 levels "C>A:A_A", "C>A:A_C", ..., "T>G:T_T"
mutation_types  <- c("C>A", "C>G", "C>T", "T>A", "T>C", "T>G")
flanking_bases  <- c("A", "C", "G", "T")
contexts        <- paste0(rep(flanking_bases, each=length(flanking_bases)), "_", flanking_bases)
mutation_levels <- paste0(rep(mutation_types, each=length(contexts)), ":", contexts)

## Reorder the mutation on the x axis
txnSB$MutationTypeContext <- factor(txnSB$MutationTypeContext, levels=mutation_levels)

## Create a bar plot with custom color and classic theme
p_txnSB <- ggplot(txnSB, aes(x=MutationTypeContext, y=Value, fill=Strand))
# Add a background to better differentiate the mutation types
# (one band of 16 contexts per mutation type, alternating two greys)
p_txnSB <- p_txnSB + geom_rect(data=NULL,aes(xmin=0.25,xmax=16.5,ymin=-Inf,ymax=Inf), fill="#E5E5E5") +
                     geom_rect(data=NULL,aes(xmin=16.5,xmax=32.5,ymin=-Inf,ymax=Inf), fill="#EDEDED") +
                     geom_rect(data=NULL,aes(xmin=32.5,xmax=48.5,ymin=-Inf,ymax=Inf), fill="#E5E5E5") +
                     geom_rect(data=NULL,aes(xmin=48.5,xmax=64.5,ymin=-Inf,ymax=Inf), fill="#EDEDED") +
                     geom_rect(data=NULL,aes(xmin=64.5,xmax=80.5,ymin=-Inf,ymax=Inf), fill="#E5E5E5") +
                     geom_rect(data=NULL,aes(xmin=80.5,xmax=96.5,ymin=-Inf,ymax=Inf), fill="#EDEDED")
# Add the bar
p_txnSB <- p_txnSB + geom_bar(stat="identity", width=0.5) + theme_classic() + scale_fill_manual(values=cb_palette_SB)


# Rename the y axis
p_txnSB <- p_txnSB + ylab(legend_y_axis)
## Set the legend position to the top of plot and remove the legend title
p_txnSB <- p_txnSB + theme(legend.position="top") + labs(fill="")
## Add margins for having place to add the mutation type labels below the bar graph
p_txnSB <- p_txnSB + theme(plot.margin=unit(c(1,1,-.1,1.5), "cm"))
## Rename the x labels: only the sequence context is displayed under each bar
p_txnSB <- p_txnSB + scale_x_discrete(name="",
                                      breaks=mutation_levels,
                                      labels=rep(contexts, times=length(mutation_types))
                                     )
## Changing the appearance of x axis ticks
p_txnSB <- p_txnSB + theme(axis.text.x = element_text(angle=60, hjust=1, vjust=1))
## Close graphics windows
graphics.off()
## Save the plot for the HTML page (higher resolution)
options(bitmapType='cairo') # Use cairo device as isn't possible to install X11 on the server...
png(paste0(output, ".png"), width=4000, height=1000, res=300)
plot(p_txnSB)
# Add a label below the bar graph for indicating the mutation type
grid.text("C>A", x=unit(.14, "npc"), y=unit(.7, "npc"), just=c("left", "bottom"), gp=gpar(fontface="bold",fontsize=10))
grid.text("C>G", x=unit(.29, "npc"), y=unit(.7, "npc"), just=c("left", "bottom"), gp=gpar(fontface="bold",fontsize=10))
grid.text("C>T", x=unit(.45, "npc"), y=unit(.7, "npc"), just=c("left", "bottom"), gp=gpar(fontface="bold",fontsize=10))
grid.text("T>A", x=unit(.58, "npc"), y=unit(.7, "npc"), just=c("left", "bottom"), gp=gpar(fontface="bold",fontsize=10))
grid.text("T>C", x=unit(.74, "npc"), y=unit(.7, "npc"), just=c("left", "bottom"), gp=gpar(fontface="bold",fontsize=10))
grid.text("T>G", x=unit(.9, "npc"), y=unit(.7, "npc"), just=c("left", "bottom"), gp=gpar(fontface="bold",fontsize=10))
invisible( dev.off() )



# Save the plot for the report
# (the plot is passed explicitly instead of relying on the last plot displayed)
ggsave(paste0(output_temp, "-Report.png"), plot=p_txnSB, width=18)

# Delete the empty plot created by the script
if (file.exists("Rplots.pdf")) invisible( file.remove("Rplots.pdf") )
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.txt Tue Apr 19 03:07:11 2016 -0400 @@ -0,0 +1,76 @@ +============================== + MutSpec-Suite +============================== + +Created by Maude Ardin and Vincent Cahais (Mechanisms of Carcinogenesis Section, International Agency for Research on Cancer F69372 Lyon France, http://www.iarc.fr/) + +Version 1.0 + +Released under GNU public license version 2 (GPL v2) + +Package description: Ardin et al. - 2016 - MutSpec: a Galaxy toolbox for streamlined analyses of somatic mutation spectra in human and mouse cancer genomes - BMC Bioinformatics + +Test data: https://usegalaxy.org/u/maude-ardin/p/mutspectestdata + + +### Requirements + + # python-dev +build-essential and python-dev packages must be installed on your machine before installing MutSpec tools: +$ sudo apt-get install build-essential python-dev + + + # Annovar +If you do not have ANNOVAR installed, you can download it here: http://www.openbioinformatics.org/annovar/annovar_download_form.php + +1) Once downloaded, install ANNOVAR per the installation instructions and edit the PATH variable in the Galaxy daemon script (/etc/init.d/galaxy) to reflect the location of the directory containing the ANNOVAR perl scripts. + +2) Create directories for saving Annovar databases + 2-a Create a folder (annovardb) for saving all Annovar databases, e.g. hg19db + 2-b Create a subfolder (seqFolder) for saving the reference genome, e.g. hg19db/hg19_seq + +3) Download the reference genome (by chromosome) from UCSC for all desired builds as follows: +$ annotate_variation.pl -buildver <build> -downdb seq <seqFolder> + +where <build> can be hg18, hg19 or hg38 for the human genome or mm9, mm10 for the mouse genome. +and <seqFolder> is the location where the sequences (by chromosome) should be stored, e.g. 
hg19db/hg19_seq + + +4) Download all desired databases for all desired builds as follows: +$ annotate_variation.pl -buildver <build> [-webfrom annovar] -downdb <database> <annovardb> + +/!\ At least the database refGene must be downloaded /!\ + +where <build> can be hg18, hg19 or hg38 for the human genome or mm9, mm10 for the mouse genome. +and <database> is the database file to download, e.g. refGene +and <annovardb> is the location where all database files should be stored, e.g. hg19db + +The list of all available databases can be found here: http://annovar.openbioinformatics.org/en/latest/user-guide/download/ + + +5) Edit the annovar_index.loc file (in the folder galaxy-dist/tool-data/toolshed/repos/iarc/mutspec/revision/) to reflect the location of annovardb folder (containing all the databases files downloaded from Annovar). +Restart galaxy instance for changes in .loc file to take effect or reload it into the admin interface. + +6) Edit the file build_listAVDB.txt in the mutspec install directory to reflect the name and the type of the databases installed + + +### Installation + + # MutSpec-Stat and MutSpec-NMF +By default 1 CPU is used by these tools, but you may edit mutspecStat_wrapper.sh and mutspecNmf_wrapper.sh to change this number to the maximum number of CPU available on your server. + +MutSpec-Stat and MutSpec-NMF tools allow parallel computations that are time consuming. +It is recommended to use the highest number of cores available on the Galaxy server to reduce the computation time of these tools. + + + + # MutSpec-Annot +The maximum CPU value needs to be specified when installing MutSpec package by editing the file mutspecAnnot.pl to reflect the maximum number of CPU available on your server (by default 1 CPU is used). + +This tool may be time consuming for large files. For example, annotating a file with more than 25,000 variants takes 1 hour using 1 CPU (2.6 GHz), while annotating this file using 8 CPUs takes only 5 minutes. 
+We have optimized MutSpec-Annot so that the tool uses more CPUs, if available, as follows: +-files with fewer than 5,000 lines: 1 CPU is used +-files with more than 5,000 and fewer than 25,000 lines: 2 CPUs are used +-files with more than 25,000 and fewer than 100,000 lines: 8 CPUs (or the maximum number of CPUs, if fewer than 8 CPUs are available) are used (our benchmark results didn't show any time saving using more than 8 cores for files with more than 25,000 +but fewer than 100,000 lines) +-files with more than 100,000 lines: the maximum number of CPUs is used
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hg19_listAVDB.txt Tue Apr 19 03:07:11 2016 -0400 @@ -0,0 +1,37 @@ +#This is a sample file distributed with Galaxy that is used by the +#MutSpec-Annot tools. The hg19_listAVDB.txt has this format (white space +#characters are TAB characters): +# +#<RefGenome_DatabaseName> <operation> +# +# +# +#hg19_refGene.txt g +#hg19_genomicSuperDups.txt r +#hg19_snp138NonFlagged.txt f +hg19_refGene.txt g +hg19_knownGene.txt g +hg19_ensGene.txt g +hg19_cytoBand.txt r +hg19_gwasCatalog.txt r +hg19_genomicSuperDups.txt r +hg19_snp138.txt f +hg19_snp138NonFlagged.txt f +hg19_ALL.sites.2015_08.txt f +hg19_AFR.sites.2015_08.txt f +hg19_AMR.sites.2015_08.txt f +hg19_EAS.sites.2015_08.txt f +hg19_EUR.sites.2015_08.txt f +hg19_SAS.sites.2015_08.txt f +hg19_esp6500siv2_all.txt f +hg19_esp6500siv2_aa.txt f +hg19_esp6500siv2_ea.txt f +hg19_ljb26_sift.txt f +hg19_ljb26_pp2hdiv.txt f +hg19_ljb26_pp2hvar.txt f +hg19_cosmic70.txt f +hg19_exac03.txt f +hg19_exac03nontcga.txt f +hg19_exac03nonpsych.txt f +hg19_kaviar20150923.txt f +hg19_hrcr1.txt f
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mm9_listAVDB.txt Tue Apr 19 03:07:11 2016 -0400 @@ -0,0 +1,17 @@ +#This is a sample file distributed with Galaxy that is used by the +#MutSpec-Annot tools. The mm9_listAVDB.txt has this format (white space +#characters are TAB characters): +# +#<RefGenome_DatabaseName> <operation> +# +# +# +#mm9_refGene.txt g +#mm9_genomicSuperDups.txt r +#mm9_snp128.txt f +mm9_refGene.txt g +mm9_knownGene.txt g +mm9_ensGene.txt g +mm9_cytoBand.txt r +mm9_genomicSuperDups.txt r +mm9_snp128.txt f
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutspecAnnot.pl Tue Apr 19 03:07:11 2016 -0400 @@ -0,0 +1,1209 @@
#!/usr/bin/env perl

#-----------------------------------#
# Author: Maude                     #
# Script: mutspecAnnot.pl           #
# Last update: 17/02/16             #
#-----------------------------------#

use strict;
use warnings;
use Getopt::Long;
use Pod::Usage;
use File::Basename; # my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/);
use File::Path;
use Parallel::ForkManager;


our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested.
our ($refGenome, $output, $path_AVDB, $pathAVDBList, $folder_temp) = ("empty", "empty", "empty", "empty", "empty"); # The reference genome to use; The path for saving the result; The path to Annovar database; Text file with the list of the databases for Annovar; the path for saving the temporary files
our ($intervalEnd) = (10); # Number of bases for the flanking region for the sequence context.
our ($fullAVDB) = "yes"; # Add an option for using all Annovar databases for the annotation or only refGene + strand + context for having a quicker annotation (for large file with million of lines)

GetOptions('verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'refGenome=s'=>\$refGenome, 'interval=i' => \$intervalEnd, 'fullAnnotation=s' => \$fullAVDB, 'outfile|o=s' => \$output, 'pathAnnovarDB|AVDB=s' => \$path_AVDB, 'pathAVDBList=s' => \$pathAVDBList, 'pathTemporary|temp=s' => \$folder_temp) or pod2usage(2);

# The only positional argument left after the option parsing is the input (file or folder)
our ($input) = @ARGV;

pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help);
pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man);
# Exactly one argument (the input) is expected in @ARGV: print the usage of the
# script when it is missing or when extra arguments are given
pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV != 1);



######################################################################################################################################################
#                                                                   GLOBAL VARIABLES                                                                 #
######################################################################################################################################################

#########################################
###   SPECIFY THE NUMBER OF CPU       ###
#########################################
our $max_cpu = 1; # Max number of CPU to use for the annotation


# Recover the current path
our $pwd = `pwd`;
chomp($pwd);

# Input file path (split into its components; the last one is the file or folder name)
our @pathInput = split("/", $input);
# Output directories
our ($folderMutAnalysis, $folderAnnovar) = ("", "");
# File with the list of Annovar databases to use
our $listAVDB = "";
# Initialisation of chromosome, position, ref and alt values
our ($chrValue, $positionValue, $refValue, $altValue) = ("c", "s", "r", "a");
######################################################################################################################################################
#                                                                         MAIN                                                                       #
######################################################################################################################################################
## Check the presence of the flags and create the output and temp directories
CheckFlags();

## Format the file in the correct format if they are vcf or MuTect output and recover the column positions
FormatingInputFile();

# Annotate the file with Annovar, add the strand orientation and the sequence context
FullAnnotation();

######################################################################################################################################################
#                                                                      FUNCTIONS                                                                     #
######################################################################################################################################################

## Check the presence of the flags and create the output and temp directories.
## Sets the globals $folderMutAnalysis, $folderAnnovar, $listAVDB and (when not given) $folder_temp.
## Prints a message on STDERR and exits with a non-zero status when a mandatory flag is missing.
sub CheckFlags
{
	# Check the reference genome
	if($refGenome eq "empty") { print STDERR "You forget to specify the name for the reference genome!!!\nPlease specify it with the flag --refGenome\n"; exit 1; }
	# The option is declared as 'interval=i' in GetOptions
	if($intervalEnd eq "empty") { print STDERR "You forget to specify the length for the sequence context!!!\nPlease specify it with the flag --interval\n"; exit 1; }
	# If no output is specified write the result as the same place as the input file
	if($output eq "empty")
	{
		# dirname returns "." for a bare file name, so the result folder is never created at the filesystem root
		my $folderRes = dirname($input);

		$folderMutAnalysis = "$folderRes/Mutational_Analysis";
		if(!-e $folderMutAnalysis) { mkdir($folderMutAnalysis) or die "$!: $folderMutAnalysis\n"; }
	}
	else
	{
		if(!-e $output) { mkdir($output) or die "$!: $output\n"; }

		$folderMutAnalysis = "$output/Mutational_Analysis";
		if(!-e $folderMutAnalysis) { mkdir($folderMutAnalysis) or die "$!: $folderMutAnalysis\n"; }
	}
	# Create the output folder for Annovar
	$folderAnnovar = "$folderMutAnalysis/Annovar";
	if(!-e $folderAnnovar) { mkdir($folderAnnovar) or die "$!: $folderAnnovar\n"; }

	# Verify the access to Annovar databases
	if($path_AVDB eq "empty") { print STDERR "You forget to specify the path to Annovar databases!!!\nPlease specify it with the flag --pathAnnovarDB\n"; exit 1; }
	elsif(!-e $path_AVDB) { print STDERR"\nCan't access Annovar databases!\nPlease check the access to the disk\n"; exit 1; }

	# Check the file list AV DB
	if($pathAVDBList eq "empty") { print STDERR "You forget to specify the path to the list of Annovar databases!!!\nPlease specify it with the flag --pathAVDBList\n"; exit 1; }
	else { $listAVDB = "$pathAVDBList/${refGenome}_listAVDB.txt" }

	# If no temp folder is specified write the result in the current path
	if($folder_temp eq "empty") { $folder_temp = "$pwd/TEMP_MutationalAnalysis_$pathInput[$#pathInput]"; }
	if(!-e $folder_temp) { mkdir($folder_temp) or die "$!: $folder_temp\n"; }
}

## Format the file in the correct format if they are vcf or MuTect output and recover the column positions
## ($input can be a single file or a folder; in the latter case every entry of the folder is processed)
sub FormatingInputFile
{
	# The input is a folder
	if(-d $input)
	{
		# List the folder without going through the shell (safe with spaces and special characters);
		# hidden entries are skipped
		opendir(my $dirHandle, $input) or die "$!: $input\n";
		my @files = sort grep { !/^\./ } readdir($dirHandle);
		closedir($dirHandle);

		foreach my $file (@files)
		{
			my $headerOriginalFile = "";

			CheckLengthFilename("$input/$file");

			#################################################
			###           Recover the input format        ###
			#################################################
			RecoverInputFormat("$input/$file", \$headerOriginalFile);
		}
	}
	# The input is one file
	else
	{
		my $headerOriginalFile = "";

		CheckLengthFilename($input);

		#################################################
		###           Recover the input format        ###
		#################################################
		RecoverInputFormat($input, \$headerOriginalFile);
	}
}

# The name for the Excel sheet can't be longer than 31 characters
# $inputFile: path of the file to check; only the base name without its extension is measured.
# Prints a message on STDERR and exits with a non-zero status when the name is too long.
sub CheckLengthFilename
{
	my ($inputFile) = @_;

	## Verify the name of file, must be <= 31 chars for the sheet name
	my ($filename, $directories, $suffix) = fileparse($inputFile, qr/\.[^.]*/);

	if(length($filename) > 31) { print STDERR "The file: $inputFile must be <= 31 chars\nPlease modify it before running the script\n"; exit 1; }
}

# Recover the input format (vcf or txt) and depending on the format convert the input file in a suitable format for Annovar (ex: for MuTect files keep only the confident variants)
sub RecoverInputFormat
{
	my ($file, $refS_headerOriginalFile) = @_;

	my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/);

	my $inputFormat = "";

	open(F1, $file) or die "$!: $file\n";
	my $header = <F1>;
	close F1;

	### VCF and MuTect files have in their first line the type of the file
	if($header =~ /fileformat=VCF/i) { $inputFormat = "vcf"; }
	elsif($header =~ /mutect/i) { $inputFormat = "mutect"; }
	else { $inputFormat = "unknown"; }


	### VCF files
	if($inputFormat eq "vcf")
	{
		open(F1, $file) or die "$!: $file\n";
		open(OUT, ">", "$folder_temp/$filename.txt") or die "$!: $folder_temp/$filename.txt\n";
		while (<F1>)
		{
			$_ =~ s/[\r\n]+$//;
			my @tab = split("\t", $_);
			# Print the VCF header
			if($tab[0] eq "#CHROM")
			{
				$tab[0] =~ /#(.+)/;
				print OUT "$1";
				for(my $i=1; $i<=$#tab; $i++) { print OUT "\t$tab[$i]"; }
				print OUT "\n";
			}
			elsif($tab[0] !~ /##/)
			{
				# Don't consider chromosome random, GL and MT
				if( ($tab[0] =~ /random/) || ($tab[0] =~ /GL/i) ) { next; }
				print OUT "$_\n";
			}
		}
		close F1; close OUT;

		## Recover the header
		open(F1, "$folder_temp/$filename.txt") or die "$!: $folder_temp/$filename.txt\n";
		$$refS_headerOriginalFile = <F1>;
		close F1;

		# Check if there if no empty column
CheckEmptyColumn("$folder_temp/$filename.txt"); + `rm $folder_temp/$filename.txt`; + + + # Set the col number for the chr,start,ref and alt + ($chrValue, $positionValue, $refValue, $altValue) = (0, 1, 3, 4); + } + ### MuTect files + elsif($inputFormat eq "mutect") + { + `sed '1d' $file > $folder_temp/$filename-HeaderOK`; + # Keep only the SNVs of good quality + `grep -v REJECT $folder_temp/$filename-HeaderOK > $folder_temp/$filename-KEEP.txt`; + `rm $folder_temp/$filename-HeaderOK`; + + # Recover the header + open(F1, "$folder_temp/$filename-KEEP.txt") or die "$!: $folder_temp/$filename-KEEP.txt\n"; + $$refS_headerOriginalFile = <F1>; + close F1; + + # Check if there if no empty column + CheckEmptyColumn("$folder_temp/$filename-KEEP.txt"); + `rm $folder_temp/$filename-KEEP.txt`; + + # Recover the name and the number of the columns that contain the chromosome number, the start position, the ref and alt alleles. + # Use the dictionary for recover the names and the position of the columns + RecoverColNameAuto("$folder_temp/$filename-KEEPColumnCorrect.txt", $$refS_headerOriginalFile, \$chrValue, \$positionValue, \$refValue, \$altValue); + } + ### Unknown type + else + { + ## Recover the header + open(F1, $file) or die "$!: $file\n"; + $$refS_headerOriginalFile = <F1>; + close F1; + + # Check if there if no empty column + CheckEmptyColumn($file); + + ## Recover the name and the number of the columns that contain the chromosome number, the start position, the ref and alt alleles. 
+ # Use the dictionary for recover the names and the position of the columns + RecoverColNameAuto($file, $$refS_headerOriginalFile, \$chrValue, \$positionValue, \$refValue, \$altValue); + } +} + +# Some files can have empty column with no information +sub CheckEmptyColumn +{ + my ($inputFile) = @_; + + my ($filename, $directories, $suffix) = fileparse($inputFile, qr/\.[^.]*/); + + if($filename =~ /(.+)-KEEP/) { $filename = $1; } + + open(OUT, ">", "$folder_temp/$filename-ColumnCorrect.txt") or die "$!: $folder_temp/$filename-ColumnCorrect.txt\n"; + + open(F1, $inputFile) or die "$!: $inputFile\n"; + my $header = <F1>; $header =~ s/[\r\n]+$//; my @tabHeader = split("\t", $header); + print OUT $header, "\n"; + while(<F1>) + { + $_ =~ s/[\r\n]+$//; + my @tab = split("\t", $_); + + if(scalar(@tab) != scalar(@tabHeader)) + { + print OUT $tab[0]; + for(my $i=1; $i<=$#tabHeader; $i++) + { + if(defined($tab[$i])) { print OUT "\t$tab[$i]"; } + else { print OUT "\tNA"; } + } + print OUT "\n"; + } + else { print OUT "$_\n"; } + } + close F1; close OUT; +} + +# Dictionnary for extracting the name and number of columns for the chromosome, start position, ref and alt alleles. 
sub RecoverColNameAuto
{
	# Find in the header of a tabular file the 0-based index of the columns holding the chromosome, the start position, the ref and the alt alleles.
	#   $inputFile: name of the file, used for the error message only
	#   $header:    header line of the file
	#   $ref_*:     references to the four scalars that receive the column indexes
	# Prints an error and exits when the column names are not in the dictionary.
	# The arguments are kept in package variables (our) because the nested named sub SearchCol reads them directly.
	our ($inputFile, $header, $ref_chrValue, $ref_positionValue, $ref_refValue, $ref_altValue) = @_;

	if(!defined $header) { $header = ""; }
	$header =~ s/[\r\n]+$//;

	# Start from the "not found" markers tested below: without this, the indexes found for a previous file (folder input)
	# would be taken for a match on the current one
	($$ref_chrValue, $$ref_positionValue, $$ref_refValue, $$ref_altValue) = ("c", "s", "r", "a");

	## Name of the columns, in the order: chromosome, start, ref, alt
	my @mutect     = qw(contig position ref_allele alt_allele);
	my @vcf        = qw(CHROM POS REF ALT);
	my @cosmic     = qw(Mutation_GRCh37_chromosome_number Mutation_GRCh37_genome_position Description_Ref_Genomic Description_Alt_Genomic);
	my @icgc       = qw(chromosome chromosome_start reference_genome_allele mutated_to_allele);
	my @tcga       = qw(Chromosome Start_position Reference_Allele Tumor_Seq_Allele2);
	my @ionTorrent = qw(chr Position Ref Alt);
	my @proton     = qw(Chrom Position Ref Variant);
	my @varScan2   = qw(Chrom Position Ref VarAllele);
	my @annovar    = qw(Chr Start Ref Obs);
	my @custom     = qw(Chromosome Start Wild_Type Mutant);

	my @allTab = (\@mutect, \@vcf, \@cosmic, \@icgc, \@tcga, \@ionTorrent, \@proton, \@varScan2, \@annovar, \@custom);

	# Try the dictionaries one after the other and stop as soon as the four columns are known
	my $found = 0;
	foreach my $refTab (@allTab)
	{
		SearchCol($refTab);

		if( ($$ref_chrValue ne "c") && ($$ref_positionValue ne "s") && ($$ref_refValue ne "r") && ($$ref_altValue ne "a") ) { $found = 1; last; }
	}

	if(!$found)
	{
		print STDERR "The columns name are not in the dictionnary please change them before running the tool again\nFile concerning: $inputFile\n";
		print STDERR "TIP: Use one of the columns names proposed in the section Input formats of the tool\n";
		exit;
	}

	# Extract the number of the columns named in @$refTab (chromosome, start, ref, alt) from the header
	sub SearchCol
	{
		my ($refTab) = @_;

		my @tabNames  = @$refTab;
		my @tabHeader = split("\t", $header);

		# For VCF
		if( (defined $tabHeader[0]) && ($tabHeader[0] eq "#CHROM") ) { ($$ref_chrValue, $$ref_positionValue, $$ref_refValue, $$ref_altValue) = (0, 1, 3, 4); }
		# For tabular files
		else
		{
			my @targets = ($ref_chrValue, $ref_positionValue, $ref_refValue, $ref_altValue);
			for(my $i=0; $i<=$#tabNames && $i<=$#targets; $i++)
			{
				for(my $j=0; $j<=$#tabHeader; $j++)
				{
					if($tabHeader[$j] eq $tabNames[$i])
					{
						${$targets[$i]} = $j;
						last; # Once found pass to the next name
					}
				}
			}
		}
	}
}

# Annotate the file(s) with Annovar, add the strand orientation and the sequence context.
# Works on the <name>-ColumnCorrect.txt files written in $folder_temp, then removes $folder_temp.
sub FullAnnotation
{
	print "-----------------------------------------------------------------\n";
	print "---------------------------Annotation----------------------------\n";
	print "-----------------------------------------------------------------\n";

	# If the input is a folder: annotate every tabular file prepared in the temp folder
	if(-d $input)
	{
		foreach my $file (sort glob("$folder_temp/*.txt"))
		{
			# Name of the file without the directory and the extension
			my ($filename) = fileparse($file, qr/\.[^.]*/);

			# Remove the suffix added when the file was formatted
			my $filenameOK = "";
			if($filename =~ /(.+)-VariantListVCF-ColumnCorrect/) { $filenameOK = $1; }
			elsif($filename =~ /(.+)-ColumnCorrect/)              { $filenameOK = $1; }
			else { print STDERR "Case not considered for $filename!!!\n"; exit; }

			_AnnotateColumnCorrectFile($file, $filenameOK, $file);
		}
	}
	# The input file is one file
	else
	{
		my ($filenameO) = fileparse($input, qr/\.[^.]*/);

		_AnnotateColumnCorrectFile("$folder_temp/$filenameO-ColumnCorrect.txt", $filenameO, $input);
	}

	# Remove the temporary directory
	rmtree($folder_temp);
}

# Annotate one tabular file: cut it in n parts, annotate the parts in parallel and paste the results together.
#   $file:           tabular file (with header) to annotate
#   $filenameOK:     name used for the temporary sub-folder and for the final <name>.<refGenome>_multianno.txt
#   $nameForMessage: name of the file reported to the user when the header is missing
sub _AnnotateColumnCorrectFile
{
	my ($file, $filenameOK, $nameForMessage) = @_;

	#################################################
	###         Cut the files in n part           ###
	#################################################
	# Recover the number of lines of the file for deciding the number of CPU to use.
	# The count is captured at once instead of relying on $1 staying untouched across the following statements.
	my $wcOutput  = `wc -l $file`;
	my ($nbLines) = $wcOutput =~ /(\d+).+/;
	if(!defined $nbLines) { die "Can't count the lines of $file\n"; }
	my $nbVariants = $nbLines - 1; # the first line is the header

	my $cpu = 0;
	if($nbVariants <= 5000)      { $cpu = 1; }
	elsif($nbVariants < 25000)   { $cpu = 2; }
	elsif($nbVariants < 100000)  { $cpu = 8; }
	else                         { $cpu = $max_cpu; }

	# If the number predefined can't be used on the machine use the maximum number specified by the administrator
	if($cpu > $max_cpu) { $cpu = $max_cpu }

	## Recover the header
	open(my $fh, "<", $file) or die "$!: $file\n";
	my $headerOriginalFile = <$fh>;
	close $fh;
	if( (!defined $headerOriginalFile) || ($headerOriginalFile eq "") ) { print STDERR "No header for the file $nameForMessage!!!\nPlease check the format of your file\n"; exit; }

	## Remove the first line of the file
	my $fileNoHeader = "$folder_temp/${filenameOK}-NoHeader";
	`sed 1d $file > $fileNoHeader`;

	if(!-e "$folder_temp/$filenameOK") { mkdir("$folder_temp/$filenameOK") or die "Can't create the directory $folder_temp/$filenameOK\n"; }
	my $lines_per_temp = int(1+($nbLines / $cpu)); # +1 in case of the div == 0
	# split names the parts <name>-aa, <name>-ab, ...: the "-aa" part is the one that receives the header later on
	`split -l $lines_per_temp $fileNoHeader $folder_temp/$filenameOK/$filenameOK-`;

	my @files = glob("$folder_temp/$filenameOK/$filenameOK-*");

	#################################################
	###            Annotate the n part            ###
	#################################################
	my $pm = Parallel::ForkManager->new($cpu);

	foreach my $tempFile (@files)
	{
		# Forks and returns the pid for the child:
		my $pid = $pm->start and next;

		# Name of the part without its directory
		my ($outFilenameTemp) = fileparse($tempFile);

		# Convert the file in a correct format for Annovar: Chr Start End Ref Alt Otherinfo
		Convert2AV($tempFile, $chrValue, $positionValue, $refValue, $altValue, "$folder_temp/$outFilenameTemp-AVInput");

		# Annotate the file with Annovar
		my $tempFileName_AVOutput = $outFilenameTemp.".".${refGenome}."_multianno.txt";
		if($fullAVDB eq "yes") { AnnotateAV("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); }
		else                   { annotateAV_min("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); }

		# Check if the annotations worked
		# NOTE(review): every child writes and reads the same log_annovar.txt, so with several CPUs the log read here may belong to another part;
		# the exit below also ends only the child, with status 0 - confirm this is intended.
		open(my $log, "<", "$folderMutAnalysis/log_annovar.txt") or die "$!: $folderMutAnalysis/log_annovar.txt\n";
		while(my $logLine = <$log>)
		{
			if($logLine =~ /ERROR/i)
			{
				print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n";
				print STDERR $logLine;
				print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n\n";
				exit;
			}
		}
		close $log;

		# Recover the strand orientation
		my $length_AVheader = 0;
		RecoverStrand("$folder_temp/$tempFileName_AVOutput", $headerOriginalFile, $path_AVDB, $refGenome, "$folder_temp/$outFilenameTemp-Strand", \$length_AVheader);

		# Recover the sequence context
		RecoverGenomicSequence("$folder_temp/$outFilenameTemp-Strand", $length_AVheader, $intervalEnd, $refGenome, $path_AVDB, "$folder_temp/$filenameOK/$tempFileName_AVOutput");

		$pm->finish; # Terminates the child process
	}
	# Wait all the child process
	$pm->wait_all_children;

	# Paste the files together
	CombinedTempFile("$folder_temp/$filenameOK", "$folderAnnovar/$filenameOK".".".${refGenome}."_multianno.txt");
}

# Convert a tabular file (without header) into the Annovar input format: Chr Start End Ref Alt followed by all the original columns.
#   $inputFile:                                      file to convert
#   $chr_value, $start_value, $ref_value, $alt_value: 0-based indexes of the chromosome, start, ref and alt columns
#   $output:                                         file to write
sub Convert2AV
{
	my ($inputFile, $chr_value, $start_value, $ref_value, $alt_value, $output) = @_;

	open(my $in, "<", $inputFile) or die "$!: $inputFile\n";
	open(my $out, ">", $output)   or die "$!: $output\n";
	while(my $line = <$in>)
	{
		$line =~ s/[\r\n]+$//;
		my @tab = split("\t", $line);

		# Don't consider chrM and GL
		if($tab[$chr_value] =~ /M|GL/i) { next; }

		# Replace chr23 or chr24 by X or Y and add the chr prefix when it is missing
		my $chr = "";
		if($tab[$chr_value] =~ /23/)     { $chr = "chrX"; }
		elsif($tab[$chr_value] =~ /24/)  { $chr = "chrY"; }
		elsif($tab[$chr_value] =~ /chr/) { $chr = $tab[$chr_value]; }
		else                             { $chr = "chr".$tab[$chr_value]; }

		my ($start, $ref, $alt) = ($tab[$start_value], $tab[$ref_value], $tab[$alt_value]);

		# Start, End, Ref, Alt as expected by Annovar
		my @av = ();

		### Indels already formatted for Annovar (one of the alleles is "-")
		if( ($ref eq "-") || ($alt eq "-") )
		{
			# Insertion: start = start & end = start
			if($ref =~ /\-/) { @av = ($start, $start, $ref, $alt); }
			# Deletion: start = start & end = start + length(del) -1 (if start and end are the same the annotations are not retrieved)
			else             { @av = ($start, $start + (length($ref) - 1), $ref, $alt); }
		}
		### Indels not formatted for Annovar (VCF like, with the base before the event)
		# chr1	85642631	C	CT    => chr1	85642631	85642631	-	T   (mm10)
		# chr5	26085724	ACTT	A => chr5	26085725	26085727	CTT	-   (mm10)
		elsif( (length($ref) != 1) || (length($alt) != 1) )
		{
			# Remove the first base
			my $ref2 = length($ref) ? substr($ref, 1) : "";
			my $alt2 = length($alt) ? substr($alt, 1) : "";

			if(length($alt2) == 0)
			{
				my $startOK = $start + 1;
				@av = ($startOK, $startOK + length($ref2) - 1, $ref2, "-");
			}
			elsif(length($ref2) == 0)
			{
				@av = ($start, $start, "-", $alt2);
			}
			# Both alleles are longer than one base (MNV or complex indel): write it as an Annovar block substitution.
			# Previously nothing was written for Chr Start End Ref Alt in this case and the line was not valid for Annovar.
			else
			{
				@av = ($start, $start + length($ref) - 1, $ref, $alt);
			}
		}
		### SBS
		else
		{
			@av = ($start, $start, $ref, $alt);
		}

		## Print the original line at the end
		print $out join("\t", $chr, @av, @tab), "\n";
	}
	close $in; close $out;
}

# Annotate $inputFile with all the databases listed in $listAVDB; Annovar writes <$output>.<refGenome>_multianno.txt.
# The output of table_annovar.pl (stdout and stderr) is saved in $folderMutAnalysis/log_annovar.txt.
sub AnnotateAV
{
	my ($inputFile, $output) = @_;

	if(!-e $path_AVDB) { print STDERR "The Annovar database doesn't exists for the reference genome $refGenome!!!\n"; print STDERR "Please install the database for this genome before running Annovar\n"; exit; }

	# Extract the name of the databases
	my $protocol = ""; my $operation = "";
	ExtractAVDBName($listAVDB, \$protocol, \$operation);

	`table_annovar.pl $inputFile $path_AVDB -buildver $refGenome -protocol $protocol -operation $operation -remove -nastring NA -otherinfo -outfile $output > $folderMutAnalysis/log_annovar.txt 2>&1`;
}

# Build the -protocol and -operation values of table_annovar.pl from the list of databases.
#   $listAVDB:       tabular file; lines starting with # are ignored, column 1 is the name of the database file, column 2 the operation
#   $refS_protocol:  reference to the string receiving the protocol names, each followed by a comma
#   $refS_operation: reference to the string receiving the operations, each followed by a comma
sub ExtractAVDBName
{
	my ($listAVDB, $refS_protocol, $refS_operation) = @_;

	open(my $in, "<", $listAVDB) or die "$!: $listAVDB\n";
	while(my $line = <$in>)
	{
		if($line =~ /^#/) { next; }

		$line =~ s/[\r\n]+$//;
		my @tab = split("\t", $line);
		if(!defined $tab[0]) { next; } # empty line

		# db name like refGenome_dbName.txt
		if( ($tab[0] =~ /\w+_(\w+)\.txt/) && ($tab[0] !~ /sites/) && ($tab[0] !~ /esp/) && ($tab[0] !~ /sift/) && ($tab[0] !~ /pp2/) )
		{
			my ($dbName) = $tab[0] =~ /\w+_(\w+)\.txt/;
			$$refS_protocol .= $dbName.","; $$refS_operation .= $tab[1].",";
		}
		# 1000 genome: refGenome_POP.sites.YEAR_MONTH.txt => 1000gYEARmonth_pop
		if($tab[0] =~ /sites/)
		{
			my ($dbName, $year, $month) = $tab[0] =~ /\w+_(\w+)\.sites.(\d+)_(\d+)\.txt/;
			if(!defined $month) { print STDERR "The name of the 1000 genomes database $tab[0] is not recognised\n"; exit; }
			$dbName =~ tr/A-Z/a-z/;

			# convert the month number into the month name
			ConvertMonth(\$month);

			my $AVdbName_final = "1000g".$year.$month."_".$dbName;
			$$refS_protocol .= $AVdbName_final.","; $$refS_operation .= $tab[1].",";
		}
		# ESP, SIFT and PolyPhen2: refGenome_name_version.txt => name_version
		if( ($tab[0] =~ /esp/) || ($tab[0] =~ /sift/) || ($tab[0] =~ /pp2/) )
		{
			my ($dbName, $dbVersion) = $tab[0] =~ /\w+_(\w+)_(\w+)\.txt/;
			if(!defined $dbVersion) { print STDERR "The name of the database $tab[0] is not recognised\n"; exit; }
			my $AVdbName_final = $dbName."_".$dbVersion;
			$$refS_protocol .= $AVdbName_final.","; $$refS_operation .= $tab[1].",";
		}
	}
	close $in;
}

# Replace the month number referenced by $refS_month by the three-letter month used in the names of the Annovar 1000 genomes databases
# (for example 1000g2014sep_all): the previous "janv" and "sept" are not valid Annovar names.
sub ConvertMonth
{
	my ($refS_month) = @_;

	my @monthNames = qw(jan feb mar apr may jun jul aug sep oct nov dec);

	if( ($$refS_month =~ /^\d+$/) && ($$refS_month >= 1) && ($$refS_month <= 12) ) { $$refS_month = $monthNames[$$refS_month - 1]; }
	else { print STDERR "Month number don't considered\n"; exit; }
}

### Add the minimum of annotations (refGene + strand + context)
# Same as AnnotateAV, with the refGene database only.
sub annotateAV_min
{
	my ($inputFile, $output) = @_;

	if(!-e $path_AVDB) { print STDERR "The Annovar database doesn't exists for the reference genome $refGenome!!!\n"; print STDERR "Please install the database for this genome before running Annovar\n"; exit; }

	# Only the gene based annotation
	my ($protocol, $operation) = ("refGene", "g");

	`table_annovar.pl $inputFile $path_AVDB -buildver $refGenome -protocol $protocol -operation $operation -remove -nastring NA -otherinfo -outfile $output > $folderMutAnalysis/log_annovar.txt 2>&1`;
}

# Insert a Strand column between the Annovar annotations and the original columns of an Annovar output file.
#   $input:               Annovar output (<part>.<refGenome>_multianno.txt); its last header column is the one holding the original line
#   $headerOriginalFile:  header line of the original file, appended to the header that is written
#   $pathDB, $refGenome:  the strand is read from $pathDB/<refGenome>_refGene.txt
#   $output:              file to write
#   $refS_lengthAVheader: reference to a scalar that receives the index of the last column of the Annovar header (i.e. the index of the Strand column in the output)
# The header is written only when the name of $input contains "-aa" (first part produced by split). The strand is NA when the gene is not in refGene.
sub RecoverStrand
{
	my ($input, $headerOriginalFile, $pathDB, $refGenome, $output, $refS_lengthAVheader) = @_;

	my $chr_value        = recoverNumCol($input, "Chr");
	my $start_value      = recoverNumCol($input, "Start");
	my $ref_value        = recoverNumCol($input, "Ref");
	my $alt_value        = recoverNumCol($input, "Alt");
	my $func_value       = recoverNumCol($input, "Func.refGene");
	my $geneSymbol_value = recoverNumCol($input, "Gene.refGene");

	#################### Convert the input file into a hash table: variant + gene symbol => lines
	my %h_inputFile = ();
	open(my $in, "<", $input) or die "$!: $input\n";
	my $annovar_header = <$in>;
	if(!defined $annovar_header) { $annovar_header = ""; }
	$annovar_header =~ s/[\r\n]+$//;

	while(my $line = <$in>)
	{
		$line =~ s/[\r\n]+$//;
		my @tab = split("\t", $line);

		# In COSMIC the chromosome X and Y are annotated 23 and 24
		my $chr = "";
		if($tab[$chr_value] eq "chr23")    { $chr = "chrX"; }
		elsif($tab[$chr_value] eq "chr24") { $chr = "chrY"; }
		elsif($tab[$chr_value] eq "chr25") { $chr = "chrM"; }
		else                               { $chr = $tab[$chr_value]; }

		# Verify if the elements exist (errors go to STDERR like everywhere else in the script)
		if($chr eq "")                        { print STDERR "Error RecoverStrand: The chromosome value is nor defined for $line\n"; exit; }
		if(! exists $tab[$start_value])       { print STDERR "Error RecoverStrand: The start value is nor defined for $line\n"; exit; }
		if(! exists $tab[$ref_value])         { print STDERR "Error RecoverStrand: The reference value is nor defined for $line\n"; exit; }
		if(! exists $tab[$alt_value])         { print STDERR "Error RecoverStrand: The alternate value is nor defined for $line\n"; exit; }
		if(! exists $tab[$func_value])        { print STDERR "Error RecoverStrand: The functional value is nor defined for $line\n"; exit; }
		if(! exists $tab[$geneSymbol_value])  { print STDERR "Error RecoverStrand: The gene symbol value is nor defined for $line\n"; exit; }

		######## For the splicing annotation we separate the gene symbol from the aa change
		my $geneSymbol = $tab[$geneSymbol_value];
		if( ($tab[$func_value] eq "splicing") && ($tab[$geneSymbol_value] =~ /(.+)\((.+)\)/) ) { $geneSymbol = $1; }

		push(@{$h_inputFile{"$chr:$tab[$start_value]:$tab[$start_value]:$tab[$ref_value]:$tab[$alt_value]:$geneSymbol"}}, $line);
	}
	close $in;

	#################### Convert the database file into a hash table: gene symbol:chromosome => strand
	my %h_database = ();
	my ($db_geneSymbol_value, $db_strandInfo_value, $db_chr_value) = (12, 3, 2);

	my $fileNameDB = $refGenome."_refGene.txt";

	open(my $db, "<", "$pathDB/$fileNameDB") or die "$!: $pathDB/$fileNameDB\n";
	while(my $line = <$db>)
	{
		$line =~ s/[\r\n]+$//;
		my @tab = split("\t", $line);

		my $strand = $tab[$db_strandInfo_value];
		if( (!defined $strand) || ($strand eq "") ) { print STDERR "Error: the strand orientation is not specify in the database refGene\n$line\n"; exit; }

		# Some genes have several strand orientation, keep the first in the database
		my $kDB = "$tab[$db_geneSymbol_value]:$tab[$db_chr_value]";
		if(! exists $h_database{$kDB}) { $h_database{$kDB} = $strand; }
	}
	close $db;

	#################### Parse the two hash tables for recover the strand information
	open(my $out, ">", $output) or die "$!: $output\n";

	my @tabHeaderInput = split("\t", $annovar_header);
	my $lastAVcol      = $#tabHeaderInput;

	# Save the length of the Annovar header for the next function (RecoverGenomicSequence)
	$$refS_lengthAVheader = $lastAVcol;

	## Add the header only for the first part of the files
	if($input =~ /\-aa/)
	{
		# Print the Annovar header until the column before OtherInfo, then Strand and the header of the original file
		# ($headerOriginalFile is printed as received, presumably still ending with its newline - verify against the caller)
		print $out "$tabHeaderInput[0]";
		for(my $i=1; $i<$lastAVcol; $i++) { print $out "\t$tabHeaderInput[$i]"; }
		print $out "\tStrand";
		print $out "\t",$headerOriginalFile;
	}

	# Number of columns expected in a line: the Annovar columns without OtherInfo + the columns of the original file
	my @tHeaderOriginalFile = split("\t", $headerOriginalFile);
	my $lengthHeader        = scalar(@tabHeaderInput) + (scalar(@tHeaderOriginalFile) - 1);

	foreach my $kFile (sort keys %h_inputFile)
	{
		my @key  = split(":", $kFile);
		my $gene = defined $key[5] ? $key[5] : "";

		# Direct lookup in the database (it replaces a scan of all the sorted keys of refGene for every variant)
		my $kDB    = "$gene:$key[0]";
		my $strand = "NA"; # The strand orientation isn't defined
		if(exists $h_database{$kDB})
		{
			# Sometimes the line is not printed correctly: verify its length before using it
			my @firstLine  = split("\t", $h_inputFile{$kFile}[0]);
			my $lengthLine = @firstLine;
			if($lengthHeader != $lengthLine) { print STDERR "Error Recover Strand the length of the current line is not valid!!!!!\nExpected length: $lengthHeader\tlength of the line: $lengthLine\n$h_inputFile{$kFile}[0]\n"; exit; }

			$strand = $h_database{$kDB};
		}

		foreach my $line (@{$h_inputFile{$kFile}})
		{
			my @tab = split("\t", $line);
			my $j   = 0;

			# Annovar columns, then the strand, then the columns of the original file
			for(my $i=0; $i<$lastAVcol; $i++) { print $out $tab[$i],"\t"; $j=$i }
			print $out $strand;
			for(my $i=$j+1; $i<=$#tab; $i++) { print $out "\t$tab[$i]"; }
			print $out "\n";
		}
	}
	close $out;
}

sub RecoverGenomicSequence
{
	my ($inputFile, $length_AVheader, $intervalEnd, $referenceGenome, $pathToRefSeq, $output) = @_;

	############ 1) Transform the input file in a hash table: one for recover the sequence context and one for keeping the original file
	my %h_inputFileForSeqContext = (); my %h_inputFile = ();
	my $header = "";
	CreateHashTable_from_InputFile($inputFile,
$length_AVheader, \$header, $intervalEnd, \%h_inputFileForSeqContext, \%h_inputFile); + + sub CreateHashTable_from_InputFile + { + my ($input, $length_AVheader, $refS_header, $intervalEnd, $refH_inputFileForSeqContext, $refH_inputFile) = @_; + + my ($chr_value, $start_value, $strand_value) = (0, 1, $length_AVheader); + + my $countregion = 0; + my %allchr = (); + + open(F1, $input) or die "$!: $input\n"; + if($input =~ /\-aa/) { $$refS_header = <F1>; } + + while(<F1>) + { + $_ =~ s/[\r\n]+$//; + my @tab = split("\t", $_); + + my $name = "$tab[$chr_value]:$tab[$start_value]"; + my $start = $tab[$start_value] - $intervalEnd; + my $end = $tab[$start_value] + $intervalEnd; + + $start--; #make zero-start coordinate, to be consistent with UCSC + my $exonpos = "$tab[$chr_value]:$start"; + + push @{$refH_inputFileForSeqContext->{$tab[$chr_value]}}, [$name, $start, $end, $tab[$strand_value], $exonpos]; + push(@{$refH_inputFile->{"$tab[$chr_value]\t$start\t$end"}}, $_); + $countregion++; + $allchr{$tab[$chr_value]}++; + } + close F1; + } + + ############ 2) Extract the sequence context from the hash table + my %h_allRegionSeqContext = (); + my $refSeq = $pathToRefSeq; + Extract_SequenceContext(\%h_inputFileForSeqContext, $referenceGenome, $refSeq, \%h_allRegionSeqContext); + + sub Extract_SequenceContext + { + my ($refH_allRegion, $referenceGenome, $refSeq, $refH_allRegionSeqContext) = @_; + + my $folderDB = $referenceGenome."db"; + my $folderSeq = $referenceGenome."_seq"; + my $seqdir = "$refSeq/$folderSeq"; + + my %seqhash = (); #database sequence for each chromosome + my %name_seq = (); #sequence for each region + my (%seqlen, %discordlen, %badorf); #store the length of each sequence, and the ID of sequences with discordant length, ORF contains stop codon + my ($count_success, @failure) = (0); + + for my $curchr (sort keys $refH_allRegion) + { + my ($seqid, $curseq) = ('', ''); + my $fastafile = ""; + if ($curchr =~ m/^chr/) + { + %seqhash = (); #clear the seqhash storage 
+ $fastafile = "$seqdir/$curchr.fa"; #by default, all FASTA files should be saved at fastadir, with the same name + } + else + { + %seqhash = (); #clear the seqhash storage + $fastafile = "$seqdir/chr$curchr.fa"; #by default, all FASTA files should be saved at fastadir, with the same name + } + if (not -e $fastafile) { #to handle cases where no "chr" prefix is given + print "WARNING: the FASTA file $curchr.fa cannot be retrieved from the specified directory $seqdir. Sequences in this chromosome will not be processed\n"; + next; + } + + if (not %seqhash) + { + open (FASTA, $fastafile) or print "WARNING: cannot read from FASTA file $fastafile so sequences in $curchr will not be processed: $!\n" and next; + while (<FASTA>) + { + if (m/^>(\S+)/) + { + $seqid and $seqhash{$seqid} = $curseq; #finish reading the sequence for seqid and save it + $seqid = $1; + $curseq = ''; + } + else + { + s/[\r\n]+$//; + $curseq .= uc $_; #only use upper case characters + } + } + close FASTA; + $seqhash{$seqid} = $curseq; + } + if (not $seqhash{$curchr}) + { + #this chromosome just do not have FASTA sequences (maybe users used a wrong seqdir + print "WARNING: Unable to retrieve regions at $curchr due to lack of sequence information\n"; + next; + } + + for my $i (0 .. 
@{$refH_allRegion->{$curchr}}-1) + { + my ($name, $start, $end, $strand, $exonpos) = @{$refH_allRegion->{$curchr}[$i]}; + my @start = split (/,/, $start); + my @end = split (/,/, $end); + my $seq; + for my $i (0..@start-1) + { + if ($start[$i] >= length ($seqhash{$curchr})) + { + #here there must be an annotation error in user-specified gene/region definition file + print "WARNING: Ignoring the start position start=$start[$i] since it is longer than the $curchr sequence (length=" , length($seqhash{$curchr}), ")\n"; + undef $seq; + last; + } + $seq .= substr ($seqhash{$curchr}, $start[$i], $end[$i]-$start[$i]); + } + + if (defined $seq) + { + if (defined $seqlen{$name}) + { + $seqlen{$name} != length ($seq) and warn "WARNING: the sequence $name was found more than once with different sequence lengths\n"; + $seqlen{$name} != length ($seq) and $discordlen{$name}++; + } + else { $seqlen{$name} = length ($seq); } + + $name_seq{$name, $exonpos} = $seq; + $count_success++; + + # Put the sequence context in a hash table for Write the result after + ## Some sequence context are NNNNNN or empty + if( ($seq ne "NA") && ($seq =~ /N/i) ) { $refH_allRegionSeqContext->{"$curchr\t$start\t$end"} = "NA"; } + else { $refH_allRegionSeqContext->{"$curchr\t$start\t$end"} = $seq; } + } + else + { + print "WARNING: DNA sequence for $name cannot be inferred\n"; + push @failure, $name; + } + } + } # End for $curchr + } + + ############ 3) Create a file with the sequence context + WriteFile_SeqContext($inputFile, $length_AVheader, \%h_inputFile, $header, \%h_allRegionSeqContext, $output); + + sub WriteFile_SeqContext + { + my ($inputFile, $length_AVheader, $refH_InputFile, $header, $refH_allRegionSeqContext, $output) = @_; + + open(OUT, ">", $output) or die "$!: $output\n"; + + ## Add the header only for the firts part of the files + if($inputFile =~ /\-aa/) + { + my @tabHeaderInput = ""; + + $header =~ s/[\r\n]+$//; @tabHeaderInput = split("\t", $header); + # Print the Annovar header until 
the column before OtherInfo + print OUT "$tabHeaderInput[0]"; + my $j = 0; + for(my $i=1; $i<$length_AVheader+1; $i++) { print OUT "\t$tabHeaderInput[$i]"; $j=$i; } + print OUT "\tcontext"; + for(my $i=$j+1; $i<=$#tabHeaderInput; $i++) { print OUT "\t$tabHeaderInput[$i]"; } + print OUT "\n"; + } + + foreach my $k_hFile (sort keys $refH_InputFile) + { + foreach my $k_allRegonSeqContext (sort keys $refH_allRegionSeqContext) + { + if($k_hFile eq $k_allRegonSeqContext) + { + my $j=0; + + for(my $k=0; $k<=$#{$refH_InputFile->{$k_hFile}};$k++) + { + my @tab = split("\t", ${$refH_InputFile->{$k_hFile}}[$k]); + + for(my $i=0; $i<$length_AVheader+1; $i++) { print OUT $tab[$i],"\t"; $j=$i; } + print OUT $refH_allRegionSeqContext->{$k_allRegonSeqContext}; + for(my $i=$j+1; $i<=$#tab; $i++) { print OUT "\t$tab[$i]"; } + print OUT "\n"; + } + last; + } + } + } + close OUT; + } +} + +sub CombinedTempFile +{ + my ($folderTempFile, $output) = @_; + + my $cmd_cat_mt_results = "cat "; + + foreach my $file (`ls $folderTempFile/*.txt`) + { + chomp($file); + $cmd_cat_mt_results = $cmd_cat_mt_results." $file"; + } + $cmd_cat_mt_results = $cmd_cat_mt_results." 
> $output"; + `$cmd_cat_mt_results`; +} + + +sub recoverNumCol +{ + my ($input, $name_of_column) = @_; + + open(F1,$input) or die "recoverNumCol: $!: $input\n"; + # For having the name of the columns + my $search_header = <F1>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header); + close F1; + # The number of the column + my $name_of_column_NB = "toto"; + for(my $i=0; $i<=$#tab_search_header; $i++) + { + if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; last; } + } + if($name_of_column_NB eq "toto") { print STDERR "Error recoverNumCol(): the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit; } + else { return $name_of_column_NB; } +} + + + + +=head1 NAME + +mutspec-Annot + +=head1 SYNOPSIS + + mutspecannot.pl [arguments] <query-file> + + <query-file> can be a folder with multiple VCF or a single VCF + + Arguments: + -h, --help print help message + -m, --man print complete documentation + -v, --verbose use verbose output + --refGenome the reference genome to use + --interval <interger> the number of bases for the sequence context + -o, --outfile <string> output directory for the result. 
If none is specified the result will be written in the same directory as the input file + -AVDB --pathAnnovarDB <string> the path to Annovar database and the files with the chromosome size + --pathAVDBList the path to the list of AV databases installed + -temp --pathTemporary <string> the path for saving the temporary files + --fullAnnotation <string> recover all Annovar annotations (yes) or only the minimum for MutSpec-Stat (no) + + +Function: automatically run a pipeline on a list of variants and annotate them using Annovar + + Example: # Annotation only + mutspecannot.pl --refGenome hg19 --interval 10 --outfile output_directory --pathAnnovarDB path_to_annovar_database --pathAVDBList path_to_the_list_of_annovar_DB --temp path_to_temporary_directory --fullAnnotation yes|no input + + + Version: 02-2016 (Feb 2016) + + +=head1 OPTIONS + +=over 8 + +=item B<--help> + +print a brief usage message and detailed explanation of options. + +=item B<--man> + +print the complete manual of the program. + +=item B<--verbose> + +use verbose output. + + +=item B<--refGenome> + +the reference genome to use, could be hg19 or mm9. + +=item B<--interval> + +the number of bases surrounding the mutated bases, for the sequence context analysis. + +=item B<--outfile> + +the directory of output file names. If it is not specified the same directory as the input file is used. + +=item B<--pathAnnovarDB> + +the path to the directory containing the Annovar databases and the files with the chromosome size. + +=item B<--pathAVDBList> + +the path to a text file containing the list of the Annovar databases installed. + +=item B<--pathTemporary> + +the path for saving temporary files generated by the script. +If none is specified a temporary folder is created in the same directory where the script is running. 
+The temporary folder is deleted when the script finishes. + +=item B<--fullAnnotation> + +Use all Annovar databases for the annotation (set to yes) or only refGene + strand + context (set to no) for a quicker annotation (for large files with millions of lines) + +=back + +=head1 DESCRIPTION + +MutSpec-Annot is a perl script for adding annotations to a list of genetic variants generated with NGS. +Functional annotations are added using ANNOVAR software. Strand transcript orientation is added using RefSeq database and the sequence context for x bases flanking the variant positions is also added. +A text tab delimited file is produced. + +=cut
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutspecAnnot.xml Tue Apr 19 03:07:11 2016 -0400 @@ -0,0 +1,172 @@ +<tool id="mutSpecannot" name="MutSpec Annot" version="0.1" hidden="false"> +<description>Annotate variants with ANNOVAR and other databases</description> + +<requirements> + <requirement type="set_environment">SCRIPT_PATH</requirement> + <requirement type="package" version="5.18.1">perl</requirement> +</requirements> + +<command interpreter="bash"> + mutspecAnnot_wrapper.sh + $output + --refGenome ${refGenome} + --AVDB ${refGenome.fields.path} + --interval $interval + --fullAnnotation ${annotation_type} + $input +</command> + +<inputs> + <param name="input" type="data" format="txt" label="Input file" help="Select a single file, multiple files or a dataset collection"/> + + <param name="refGenome" type="select" label="Reference genome" help="Select the reference genome that was used for generating your data"> + <options from_data_table="annovar_index" /> + </param> + + <param name="interval" type="text" value="10" label="Sequence context of variants" help="Number of retrieved bases that flank variants in 5' and 3'"/> + + <param name="annotation_type" type="boolean" checked="true" truevalue="yes" falsevalue="no" label="Complete annotations" help="Select No if you have a file with millions of variants and you are just interested in having a quick overview of the mutational spectrum. Only the annotation from refGene, the strand orientation and the sequence context will be added." 
 /> + +</inputs> + +<outputs> + <data name="output" type="data" format="tabular" label="${input.name} annotated" /> +</outputs> + + +<stdio> + <regex match="ANNOVAR LOG FILE" + source="stdout" + level="fatal" + description="Read Annovar log file for more information" /> +</stdio> + +<help> + +**What it does** + +MutSpec-Annot provides functional annotations from `ANNOVAR software`__ (June 2015 version is provided here), as well as the strand transcript orientation (from refGene database) and sequence context of variants (extracted from the reference genome selected). + +.. __: http://www.openbioinformatics.org/annovar/ + +-------------------------------------------------------------------------------------------------------------------------------------------------- + +**Input formats** + +MutSpec-Annot accepts files in VCF (version 4.1) or in tab-delimited (TAB) format. + +.. class:: infomark + +TIP: If your data is not TAB delimited, use *Text manipulation -> convert* + +.. class:: warningmark + +Filenames must be <= 31 characters. + +.. class:: warningmark + +These files should contain at least four columns describing, for each variant, the chromosome number, the start genomic position, the reference allele and the alternate allele + +.. class:: warningmark + +The tool supports different column names (**names are case-sensitive**) depending on the source file as follows: + +**mutect** : contig position ref_allele alt_allele + +**vcf** : CHROM POS REF ALT + +**cosmic** : Mutation_GRCh37_chromosome_number Mutation_GRCh37_genome_position Description_Ref_Genomic Description_Alt_Genomic + +**icgc** : chromosome chromosome_start reference_genome_allele mutated_to_allele + +**tcga** : Chromosome Start_position Reference_Allele Tumor_Seq_Allele2 + +**ionTorrent** : chr Position Ref Alt + +**proton** : Chrom Position Ref Variant + +**varScan2** : Chrom Position Ref VarAllele + +**annovar** : Chr Start Ref Obs + +**custom** : Chromosome Start Wild_Type Mutant + +.. 
class:: infomark + +For MuTect output files, only confident calls are considered (variants containing the string REJECT in the judgement column are not annotated and are excluded from the MutSpec-Annot output) as other calls are very likely to be dubious calls or artefacts. + +.. class:: infomark + +For COSMIC and ICGC files, variants are reported on several transcripts. These duplicate variants need to be removed before annotating the file. + +.. class:: warningmark + +If multiple input files are specified they should be from the **same genome build** + + +-------------------------------------------------------------------------------------------------------------------------------------------------- + +**Output** + +The output is a tabular text file, that contains the retrieved annotations in the first columns and all columns from the original file at the end. + +.. class:: infomark + +Variants on chromosome M and random chromosomes are not considered for the annotation and excluded from MutSpec-Annot output. + +The following annotations are retrieved: + +**ANNOVAR annotations** + +An example of annotations retrieved by the tool. + +Gene-based: RefSeqGene, UCSC Known Gene and Ensembl Gene + +Region-based: localization of the variant on cytogenetic band (cytoBand), variant reported in Genome-Wide association studies (gwasCatalog) and variant mapped to segmental duplications (genomicSuperDups) + +Filter-based: + + - dbSNP: For human genome there are two versions available: the default version (snp) and a pre-filtered version (snpNonFlagged). In the pre-filtered version all SNPs with less than 1% minor allele frequency (MAF) (or unknown), mapping only once to reference assembly, or flagged in dbSnp as clinically associated are removed from the full dbSNP database and therefore not present in this version. 
+ + - 1000 Genomes Project (ALL, AFR (African), AMR (Admixed American), EAS (East Asian), EUR (European), SAS (South Asian)) + + - ESP: Exome Sequencing Project (ALL, AA (African American), EA (European American)) + + - ExAC: Exome Aggregation Consortium (ALL, AFR (African), AMR (Admixed American), EAS (East Asian), FIN (Finnish), NFE (Non-finnish European), OTH (other), SAS (South Asian)) + + - LJB26: SIFT, PolyPhen-2 (HDIV and HVAR) + +**Transcript orientation** + +The strand annotation corresponding to transcript orientation within genic regions is recovered from RefSeqGene database. + +**Sequence context** + +Flanking bases in both sides in 5' and 3' of the variant position retrieved from the reference genome used. + +-------------------------------------------------------------------------------------------------------------------------------------------------- + +**Example** + +Annotate the following file:: + + Chromosome Start_Position End_Position Reference_Allele Tumor_Seq_Allele2 + chr7 121717919 121717920 - G + chr1 230846235 230846235 T A + chr14 33290999 33290999 A G + chr12 8082458 8082458 C T + chr4 70156391 70156391 T C + +Will produce:: + + Chr Start End Ref Alt Func.refGene Gene.refGene ExonicFunc.refGene AAChange.refGene genomicSuperDups snp138 1000g2014oct_all esp6500si_all Strand context Chromosome Start_Position End_Position Reference_Allele Tumor_Seq_Allele2 + chr7 121717919 121717920 - G exonic AASS frameshift insertion AASS:NM_005763:exon23:c.2634dupC:p.A879fs NA rs147476318 NA NA - GCG chr7 121717919 121717920 - G + chr1 230846235 230846235 T A exonic AGT nonsynonymous SNV AGT:NM_000029:exon2:c.A362T:p.H121L NA NA NA NA - GTG chr1 230846235 230846235 T A + chr14 33290999 33290999 A G exonic AKAP6 nonsynonymous SNV AKAP6:NM_004274:exon13:c.A3980G:p.D1327G NA NA NA NA + GAC chr14 33290999 33290999 A G + chr12 8082458 8082458 C T exonic SLC2A3 nonsynonymous SNV SLC2A3:NM_006931:exon6:c.G683A:p.R228Q NA rs200481428 0.000199681 NA - CCG chr12 
8082458 8082458 C T + chr4 70156391 70156391 T C exonic UGT2B28 nonsynonymous SNV UGT2B28:NM_053039:exon5:c.T1172C:p.V391A score=0.949699;Name=chr4:70035680 NA 0.000199681 NA + GTA chr4 70156391 70156391 T C + + +</help> + +</tool>
#!/bin/bash
#
# mutspecAnnot_wrapper.sh - Galaxy wrapper around mutspecAnnot.pl.
#
# Invocation (see the <command> section of mutspecAnnot.xml):
#   mutspecAnnot_wrapper.sh <output> --refGenome <genome> --AVDB <path> \
#       --interval <n> --fullAnnotation <yes|no> <input>
#
# The first argument is the Galaxy output dataset; every remaining argument
# is forwarded untouched to mutspecAnnot.pl.  The annotated table written
# under out/Mutational_Analysis/Annovar/ is then copied to <output>.

output=$1;shift
# After the shift the positional parameters are the forwarded options:
# $2 is the value following --refGenome and $9 is the input file.
refg=$2
input=${9}

# `return` is only valid inside a function or a sourced script; at top level
# bash reports an error and keeps going, so the missing-dependency case must
# use `exit` to really stop the job.
command -v table_annovar.pl >/dev/null 2>&1 || {
	echo "ERROR : table_annovar.pl not found. Add annovar scripts to your galaxy path !" >&2 ;
	exit 1 ;
}

mkdir -p out
name=${input##*/}
name=${name%%.*}

# "$@" (instead of an unquoted $*) keeps each forwarded argument intact even
# when a path contains whitespace.  stderr is merged into stdout as before.
perl "$SCRIPT_PATH/mutspecAnnot.pl" \
	--outfile out \
	--pathAVDBList "$SCRIPT_PATH" \
	--temp "./temp" \
	"$@" 2>&1

ls out/Mutational_Analysis/Annovar/

# Fail the job when the expected result file is missing instead of reporting
# success with an empty output dataset.
cp "out/Mutational_Analysis/Annovar/${name}.${refg}_multianno.txt" "$output" || exit 1

exit 0
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutspecCompare.xml Tue Apr 19 03:07:11 2016 -0400 @@ -0,0 +1,109 @@ +<?xml version="1.0"?> +<tool id="mutSpeccompare" name="MutSpec Compare" version="0.0.1"> +<description>Compare signatures with the cosine similarity method</description> + +<requirements> + <requirement type="set_environment">SCRIPT_PATH</requirement> + <requirement type="package" version="3.1.2">R</requirement> + <requirement type="package" version="0.1">mutspec</requirement> +</requirements> + +<command interpreter="bash"> + mutspecCompare_wrapper.sh + $newsign + $output + #if $refSignatureSource.source == "fromtable": + \$SCRIPT_PATH/Frequency-COSMICv72-Hupki.txt + #else + ${refSignatureSource.h_publish} + #end if +</command> + +<inputs> + <conditional name="refSignatureSource"> + <param name="source" type="select" label="Reference signatures" help="You may select the provided file that includes published signatures (see details further below) or your own reference file"> + <option value="fromtable">Use COSMICv72_Hupki2014</option> + <option value="history">Use one from my history</option> + </param> + <when value="fromtable"> + <options from_data_table="published_signature_matrice" /> + </when> + <when value="history"> + <param name="h_publish" type="data" format="tabular" label="Select a file from my history" help="Matrix correctly formated (see details further below)"/> + </when> + </conditional> + + <param name="newsign" type="data" format="html" label="Newly identified signature" help="Select an output of the tool MutSpec-NMF"/> + +</inputs> + +<outputs> + <data name="output" format="html" label="Similarity_Matrix on dataset ${newsign.name}" /> +</outputs> + + +<help> + +**What it does** + +Compare two matrices containing published and newly identified mutation signatures using the `cosine similarity`__ method as already used by `Alexandrov et al. 2013`__, `Olivier et al. 2014`__ or `Schulze et al. 2015`__ + +.. 
__: http://en.wikipedia.org/wiki/Cosine_similarity + +.. __: http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3588146/ + +.. __: http://www.nature.com/srep/2014/140327/srep04482/full/srep04482.html + +.. __: http://www.nature.com/ng/journal/v47/n5/fig_tab/ng.3252_SF3.html + +-------------------------------------------------------------------------------------------------------------------------------------------------- + +**Output** + +A HTML page displaying a heatmap representing the similarity between the new signatures and the published ones. + +Values close to 1 (red) indicate a high similarity between the signatures. + +-------------------------------------------------------------------------------------------------------------------------------------------------- + +**Published signatures** + +The reference signatures matrix (COSMICv72-Hupki2014 matrix) includes + +1. The 30 signatures published in `COSMIC database, v72`__ + +2. The 4 experimental signatures obtained in mouse cells for AA, MNNG, BaP and AID that were published in `Olivier et al. 2014`__ + + +.. __: http://cancer.sanger.ac.uk/cosmic/signatures + +.. 
__: http://www.nature.com/srep/2014/140327/srep04482/full/srep04482.html + + + +-------------------------------------------------------------------------------------------------------------------------------------------------- + +**Example** + +Matrix of known signatures + ++-------------------+---------------+-----------------------+--------------+--------------+ +| Substitution Type | Trinucleotide | Somatic Mutation Type | Signature 1 | Signature 2 | ++===================+===============+=======================+==============+==============+ +| C>A | ACA | A[C>A]A | 0.0110983262 | 0.0006827082 + ++-------------------+---------------+-----------------------+--------------+--------------+ +| C>A | ACC | A[C>A]C | 0.0091493407 | 0.0006191072 + ++-------------------+---------------+-----------------------+--------------+--------------+ +| C>A | ACG | A[C>A]G | 0.0014900705 | 0.000099279 + ++-------------------+---------------+-----------------------+--------------+--------------+ +| C>A | ACT | A[C>A]T | 0.0062338852 | 0.0003238914 + ++-------------------+---------------+-----------------------+--------------+--------------+ +| C>A | CCA | C[C>A]A | 0.0065958701 | 0.000677445 + ++-------------------+---------------+-----------------------+--------------+--------------+ +| C>A | CCC | C[C>A]C | 0.0073423678 | 0.000213681 + ++-------------------+---------------+-----------------------+--------------+--------------+ + + +</help> + +</tool> \ No newline at end of file
#!/bin/bash
#
# mutspecCompare_wrapper.sh - Galaxy wrapper around R/compareSignature_Galaxy.r.
#
# Arguments:
#   $1  HTML output of MutSpec-NMF (its "<name>_files" directory holds the
#       NMF/Files/MatrixW-Normto100.txt matrix that is compared)
#   $2  HTML output dataset to create
#   $3  matrix of reference (published) signatures

newsign=$1
html=$2
ref=$3

# Strip only the final extension (same rule as for $matrix below).  The
# previous `%%.*` cut the path at the FIRST dot, which yields a wrong
# directory as soon as any parent directory name contains a dot.
output_dir=${html%.*}_files

matrix=${newsign%.*}_files/NMF/Files/MatrixW-Normto100.txt

mkdir -p "$output_dir"

# stderr is merged into stdout as before; a failing R script now fails the job
# instead of producing a page that links to files that do not exist.
Rscript --no-save "$SCRIPT_PATH/R/compareSignature_Galaxy.r" "$ref" "$matrix" "$output_dir" 2>&1 || exit 1

# Build the HTML report.  A quoted here-document keeps the attribute quotes
# that the nested double quotes of the former echo commands silently dropped.
# The links are relative: the files live in $output_dir.
cat > "$html" <<'EOF'
<html><body>
<center> <h2> Cosine similarity comparison </h2> </center>
<table>
<tr> <td> <center> <br/> <a href="Similarity_Matrix.txt">Similarity_Matrix.txt</a> </center> </td> </tr>
<tr>
<td><a href="Similarity_Matrix.png">
<img width="1000" src="Similarity_Matrix.png" /></a></td>
</tr>
</table>
</body></html>
EOF

exit 0
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutspecFilter.pl Tue Apr 19 03:07:11 2016 -0400 @@ -0,0 +1,378 @@ +# !/usr/bin/perl + +#-----------------------------------# +# Author: Maude # +# Script: mutspecFilter.pl # +# Last update: 18/03/16 # +#-----------------------------------# + +use strict; +use warnings; +use Getopt::Long; +use Pod::Usage; +use File::Basename; # my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/); +use File::Path; + +################################################################################################################################################################################ +# Filter an Annotaed file with Annovar # +################################################################################################################################################################################ + +our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested. +our ($dbSNP_value, $segDup, $esp, $thG) = (0, 0, 0, 0); # For filtering agains the databases dbSNP, genomic duplicate segments, Exome Sequencing Project and 1000 genome. +our ($output, $refGenome) = ("", ""); # The path for saving the result; The reference genome to use. +our ($listAVDB) = "empty"; # Text file with the list Annovar databases. 
+our ($dir) = ""; + +GetOptions('dir|d=s'=>\$dir,'verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'dbSNP=i'=>\$dbSNP_value, 'segDup'=>\$segDup, 'esp'=>\$esp, 'thG'=>\$thG, 'outfile|o=s' => \$output, 'refGenome=s'=>\$refGenome, 'pathAVDBList=s' => \$listAVDB) or pod2usage(2); + +our ($input) = @ARGV; + +pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help); +pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man); +pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 0); # No argument is pass to the command line print the usage of the script +pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 2); # Only one argument is expected to be pass to @ARGV (the input) + + + +# If the dbSNP value is not equal to zero filter using the dbSNP column specify +our $dbSNP = 0; +if($dbSNP_value > 0) { $dbSNP = 1; } + + +############ Check flags ############ +if($listAVDB eq "empty") { $listAVDB = "$dir/${refGenome}_listAVDB.txt" } + +# Zero databases is specified +if( ($dbSNP == 0) && ($segDup == 0) && ($esp == 0) && ($thG == 0) ) +{ + print STDERR "There is no databases selected for filtering against!!!\nPlease choose at least one between dbSNP, SegDup, ESP (only for human genome) or 1000 genome (only for human genome)\n"; + exit; +} + + + +############ Recover the name of the databases to filter against ############ +my ($segDup_name, $espAll_name, $thousandGenome_name) = ("", "", ""); +my @tab_protocol = (); + +if( ($segDup == 1) || ($esp == 1) || ($thG == 1) ) +{ + ### Recover the name of the column + my $protocol = ""; + ExtractAVDBName($listAVDB, \$protocol); + @tab_protocol = split(",", $protocol); + + for(my $i=0; $i<=$#tab_protocol; $i++) + { + if($tab_protocol[$i] =~ /genomicSuperDups/) { $segDup_name = $tab_protocol[$i]; } + elsif($tab_protocol[$i] =~ /1000g/) { $thousandGenome_name = $tab_protocol[$i]; } + elsif($tab_protocol[$i] =~ /esp/) { $espAll_name = $tab_protocol[$i]; } + } +} + + +############ Filter the 
file ############ +filterAgainstPublicDB(); + + +print STDOUT "\tFilter selected\tdbSNP = ".$dbSNP."\tsegDup = ".$segDup."\tesp = ".$esp."\tthG = ".$thG."\n"; + + +sub filterAgainstPublicDB +{ + open(FILTER, ">", "$output") or die "$!: $output\n"; + + open(F1, $input) or die "$!: $input\n"; + my $header = <F1>; print FILTER $header; + while(<F1>) + { + $_ =~ s/[\r\n]+$//; + my @tab = split("\t", $_); + + my ($segDupInfo, $espAllInfo, $thgInfo) = (0, 0 ,0); + + if($segDup == 1) + { + my $segDup_value = recoverNumCol($input, $segDup_name); + $segDupInfo = formatSegDupInfo($tab[$segDup_value]); + # Replace NA by 0 for making test on the same type of variable + $segDupInfo =~ s/NA/0/; + } + if($esp == 1) + { + my $espAll_value = recoverNumCol($input, $espAll_name); + $espAllInfo = $tab[$espAll_value]; + # Replace NA by 0 for making test on the same type of variable + $espAllInfo =~ s/NA/0/; + } + if($thG == 1) + { + my $thousandGenome_value = recoverNumCol($input, $thousandGenome_name); + # Replace NA by 0 for making test on the same type of variable + $thgInfo = $tab[$thousandGenome_value]; + $thgInfo =~ s/NA/0/; + } + + + ############################## + # One Filter # + ############################## + # Remove all the variants present in dbSNP + if( ($dbSNP == 1) && ($segDup==0) && ($esp==0) && ($thG==0) ) { if($tab[$dbSNP_value-1] eq "NA") { print FILTER "$_\n"; } } + # Remove all the variants with a frequency greater than or equal to 0.9 in genomic duplicate segments database + if( ($dbSNP==0) && ($segDup == 1) && ($esp==0) && ($thG==0) ) { if($segDupInfo < 0.9) { print FILTER "$_\n"; } } + # Remove all the variants with greater than 0.001 in Exome sequencing project + if( ($dbSNP==0) && ($segDup==0) && ($esp == 1) && ($thG==0) ) { if($espAllInfo <= 0.001) { print FILTER "$_\n"; } } + # Remove all the variants with greater than 0.001 in 1000 genome database + if( ($dbSNP==0) && ($segDup==0) && ($esp==0) && ($thG == 1) ) { if($thgInfo <= 0.001) { print FILTER 
"$_\n"; } } + + + ############################# + # Two Filter # + ############################## + if( ($dbSNP==1) && ($segDup==1) && ($esp==0) && ($thG== 0) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) ) { print FILTER "$_\n"; } } + if( ($dbSNP==1) && ($segDup==0) && ($esp==1) && ($thG==0) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($espAllInfo <= 0.001) ) { print FILTER "$_\n"; } } + if( ($dbSNP==1) && ($segDup==0) && ($esp==0) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($thgInfo <= 0.001) ) { print FILTER "$_\n"; } } + + if( ($dbSNP==0) && ($segDup==1) && ($esp==1) && ($thG==0) ) { if( ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) ) { print FILTER "$_\n"; } } + if( ($dbSNP==0) && ($segDup==1) && ($esp==0) && ($thG==1) ) { if( ($segDupInfo < 0.9) && ($thgInfo <= 0.001) ) { print FILTER "$_\n"; } } + + if( ($dbSNP==0) && ($segDup==0) && ($esp==1) && ($thG==1) ) { if( ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) ) { print FILTER "$_\n"; } } + + + ############################# + # Three Filter # + ############################## + if( ($dbSNP==1) && ($segDup==1) && ($esp==1) && ($thG==0) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) ) + { print FILTER "$_\n"; } } + if( ($dbSNP==1) && ($segDup==1) && ($esp==0) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($thgInfo <= 0.001) ) + { print FILTER "$_\n"; } } + if( ($dbSNP==1) && ($segDup==0) && ($esp==1) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) ) + { print FILTER "$_\n"; } } + if( ($dbSNP==0) && ($segDup==1) && ($esp==1) && ($thG==1) ) { if( ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) ) + { print FILTER "$_\n"; } } + + + ############################# + # FOUR Filter # + ############################## + if( ($dbSNP==1) && ($segDup==1) && ($esp==1) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($espAllInfo 
<= 0.001) && ($thgInfo <= 0.001) ) + { print FILTER "$_\n"; } } + + } + close F1; close FILTER; +} + + +sub formatSegDupInfo +{ + my ($segDup_info) = @_; + + if($segDup_info ne "NA") # Score=0.907883;Name=chr9:36302931 + { + my @segDup = split(";", $segDup_info); + $segDup[0] =~ /Score=(.+)/; + return $1; + } + else { return $segDup_info; } +} + + +sub ExtractAVDBName +{ + my ($listAVDB, $refS_protocol) = @_; + + open(F1, $listAVDB) or die "$!: $listAVDB\n"; + while(<F1>) + { + if ($_ =~ /^#/) { next; } + + $_ =~ s/[\r\n]+$//; + my @tab = split("\t", $_); + + # db name like refGenome_dbName.txt + if( ($tab[0] =~ /\w+_(\w+)\.txt/) && ($tab[0] !~ /sites/) && ($tab[0] !~ /esp/) && ($tab[0] !~ /sift/) && ($tab[0] !~ /pp2/) ) + { + my $temp = $1; + if($temp =~ /genomicSuperDups/) { $$refS_protocol .= $temp.","; } + } + # 1000 genome + if($tab[0] =~ /sites/) + { + $tab[0] =~ /\w+_(\w+)\.sites.(\d+)_(\d+)\.txt/; + my ($dbName, $year, $month) = ($1, $2, $3); + $dbName =~ tr/A-Z/a-z/; + + # convert the month number into the month name + ConvertMonth(\$month); + + my $AVdbName_final = "1000g".$year.$month."_".$dbName; + + if($dbName eq "all") { $$refS_protocol .=$AVdbName_final.","; } + } + # ESP + if($tab[0] =~ /esp/) + { + $tab[0] =~ /\w+_(\w+)_(\w+)\.txt/; + my $AVdbName_final = $1."_".$2; + + if($2 eq "all") { $$refS_protocol .=$AVdbName_final.","; } + } + } + close F1; + + sub ConvertMonth + { + my ($refS_month) = @_; + + if($$refS_month == 1) { $$refS_month = "janv"; } + elsif($$refS_month == 2) { $$refS_month = "feb"; } + elsif($$refS_month == 3) { $$refS_month = "mar"; } + elsif($$refS_month == 4) { $$refS_month = "apr"; } + elsif($$refS_month == 5) { $$refS_month = "may"; } + elsif($$refS_month == 6) { $$refS_month = "jun"; } + elsif($$refS_month == 7) { $$refS_month = "jul"; } + elsif($$refS_month == 8) { $$refS_month = "aug"; } + elsif($$refS_month == 9) { $$refS_month = "sept"; } + elsif($$refS_month == 10) { $$refS_month = "oct"; } + elsif($$refS_month == 11) { 
$$refS_month = "nov"; } + elsif($$refS_month == 12) { $$refS_month = "dec"; } + else { print STDERR "Month number don't considered\n"; exit; } + } +} + + +sub recoverNumCol +{ + my ($input, $name_of_column) = @_; + + # With Annovar updates the databases name changed and are present in an array + if( ref($name_of_column) eq "ARRAY" ) + { + my $test = ""; + my @tab = @$name_of_column; + foreach (@tab) + { + open(F1,$input) or die "$!: $input\n"; + # For having the name of the columns + my $search_header = <F1>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header); + close F1; + # The number of the column + my $name_of_column_NB = "toto"; + for(my $i=0; $i<=$#tab_search_header; $i++) + { + if($tab_search_header[$i] eq $_) { $name_of_column_NB = $i; } + } + if($name_of_column_NB eq "toto") { next; } + else { return $name_of_column_NB; } + } + if($name_of_column eq "toto") { print "Error recoverNumCol: the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit; } + } + # Only one name is pass + else + { + open(FT,$input) or die "$!: $input\n"; + # For having the name of the columns + my $search_header = <FT>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header); + close FT; + # The number of the column + my $name_of_column_NB = "toto"; + for(my $i=0; $i<=$#tab_search_header; $i++) + { + if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; } + } + if($name_of_column_NB eq "toto") { print "Error recoverNumCol: the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit; } + else { return $name_of_column_NB; } + } +} + +=head1 NAME + +mutspecFilter - Filter a file annotated with MutSpec-Annot tool. 
Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome obtained from Annovar) will be removed from the input file (with frequency limits described above) + +=head1 SYNOPSIS + + mutspecFilter.pl [arguments] <query-file> + + <query-file> an annotated file + + Arguments: + -h, --help print help message + -m, --man print complete documentation + -v, --verbose use verbose output + --dbSNP <value> filter against dbSNP database. Specify the number of the dbSNP column in the file + --segDup filter against genomic duplicate database + --esp filter against Exome Sequencing Project database (only for human) + --thG filter against 1000 genome database (onyl for human) + -o, --outfile <string> name of output file + --refGenome reference genome to use + --pathAVDBList path to the list of Annovar databases installed + + +Function: Filter out variants present in public databases + + Example: # Filter against dbSNP + mutspecFilter.pl --dbSNP col_number --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input + + # Filter against the four databases + mutspecFilter.pl --dbSNP col_number --segDup --esp --thG --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input + + + Version: 03-2016 (March 2016) + + +=head1 OPTIONS + +=over 8 + +=item B<--help> + +print a brief usage message and detailed explanation of options. + +=item B<--man> + +print the complete manual of the program. + +=item B<--verbose> + +use verbose output. 
+ +=item B<--dbSNP> + +Remove all the variants present in the dbSNP database +Specify the number of the column containing the annotation +For human and mouse genome + +=item B<--segDup> + +Remove all the variants with a frequency greater than or equal to 0.9 in genomic duplicate segments database +For human and mouse genome + +=item B<--esp> + +Remove all the variants with a frequency greater than 0.001 in Exome sequencing project +For human genome only + +=item B<--thG> + +Remove all the variants with a frequency greater than 0.001 in 1000 genome database +For human genome only + +=item B<--refGenome> + +the reference genome to use, could be hg19 or mm9. + +=item B<--outfile> + +the name of the output file + +=item B<--pathAVDBList> + +the path to a text file containing the list of the Annovar databases installed. + +=back + +=head1 DESCRIPTION + +mutspecFilter - Filter a file annotated with MutSpec-Annot tool. Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome obtained from Annovar) will be removed from the input file (with frequency limits described above) + +=cut
<!-- File: mutspecFilter.xml
     Galaxy tool definition for MutSpec Filter: runs mutspecFilter.pl on an annotated
     tab-delimited file and removes variants found in the selected public databases. -->
<tool id="MutSpecfilter" name="MutSpec Filter" version="0.1" hidden="false">
<description>Filter out variants present in public databases</description>

<requirements>
    <requirement type="set_environment">SCRIPT_PATH</requirement>
    <requirement type="package" version="5.18.1">perl</requirement>
</requirements>

<command interpreter="perl">
    mutspecFilter.pl
    --dir \$SCRIPT_PATH
    $segDup
    $esp
    $thG
    #if $FilterdbSNP.dbSNP == "true":
        --dbSNP ${FilterdbSNP.column}
    #else
        --dbSNP 0
    #end if
    --refGenome ${refGenome}
    --outfile $output
    $input
</command>

<inputs>
    <param name="input" type="data" format="txt" label="Input file"/>

    <param name="refGenome" type="select" label="Reference genome" help="All your data should have been annotated with the selected genome">
        <options from_data_table="annovar_index" />
    </param>

    <!-- dbSNP filtering: the column selector is only shown when the filter is enabled.
         When disabled, the command section passes "dbSNP 0" to the script. -->
    <conditional name="FilterdbSNP">
        <param name="dbSNP" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Filter against dbSNP database" help="Remove variants with a RS number" />
        <when value="true">
            <param name="column" type="data_column" data_ref="input" label="Select the dbSNP column for filtering" use_header_names="true" help="Select a column name snp or snpNonFlagged" />
        </when>
        <!-- Explicit empty branch so that every value of the test parameter has a matching case -->
        <when value="false" />
    </conditional>


    <!-- Each boolean below expands to the script flag itself (or to nothing when unchecked) -->
    <param name="segDup" type="boolean" checked="true" truevalue="--segDup" falsevalue="" label="Filter against SegDup database" help="Remove variants present at >= 0.9 frequency in the genomic duplicate segments database" />
    <param name="esp" type="boolean" checked="true" truevalue="--esp" falsevalue="" label="Filter against the ESP database" help="Remove variants present at frequency > 0.001 in the Exome Sequencing Project database (only valid for human genomes)" />
    <param name="thG" type="boolean" checked="true" truevalue="--thG" falsevalue="" label="Filter against the 1000g database project" help="Remove variants present at frequency > 0.001 in the 1000 genome database (only valid for human genomes)" />
</inputs>

<outputs>
    <data name="output" format="tabular" label="${input.name.split(' ')[0]} filtered" />
</outputs>

<help>

**What it does**

Filter a file annotated with MutSpec-Annot tool. Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome obtained from Annovar) will be removed from the input file (with frequency limits described above).

.. class:: warningmark

The databases ESP and 1000 genome can be used only for human genomes

--------------------------------------------------------------------------------------------------------------------------------------------------

**Input**

.. class:: warningmark

Tab delimited text files generated by MutSpec-Annot tool.

--------------------------------------------------------------------------------------------------------------------------------------------------

**Output**

Tab delimited text file filtered for variants considered as neutral polymorphisms.

--------------------------------------------------------------------------------------------------------------------------------------------------

**Example**

Filter the following file::

    Chr Start End Ref Alt Func.refGene Gene.refGene ExonicFunc.refGene AAChange.refGene genomicSuperDups snp138 1000g2014oct_all esp6500si_all Strand context Chromosome Start_Position End_Position Reference_Allele Tumor_Seq_Allele2
    chr7 121717919 121717920 - G exonic AASS frameshift insertion AASS:NM_005763:exon23:c.2634dupC:p.A879fs NA rs147476318 NA NA - GCG chr7 121717919 121717920 - G
    chr1 230846235 230846235 T A exonic AGT nonsynonymous SNV AGT:NM_000029:exon2:c.A362T:p.H121L NA NA NA NA - GTG chr1 230846235 230846235 T A
    chr14 33290999 33290999 A G exonic AKAP6 nonsynonymous SNV AKAP6:NM_004274:exon13:c.A3980G:p.D1327G NA NA NA NA + GAC chr14 33290999 33290999 A G
    chr12 8082458 8082458 C T exonic SLC2A3 nonsynonymous SNV SLC2A3:NM_006931:exon6:c.G683A:p.R228Q NA rs200481428 0.000199681 NA - CCG chr12 8082458 8082458 C T
    chr4 70156391 70156391 T C exonic UGT2B28 nonsynonymous SNV UGT2B28:NM_053039:exon5:c.T1172C:p.V391A score=0.949699;Name=chr4:70035680 NA 0.000199681 NA + GTA chr4 70156391 70156391 T C

Will produce::

    Chr Start End Ref Alt Func.refGene Gene.refGene ExonicFunc.refGene AAChange.refGene genomicSuperDups snp138 1000g2014oct_all esp6500si_all Strand context Chromosome Start_Position End_Position Reference_Allele Tumor_Seq_Allele2
    chr1 230846235 230846235 T A exonic AGT nonsynonymous SNV AGT:NM_000029:exon2:c.A362T:p.H121L NA NA NA NA - GTG chr1 230846235 230846235 T A
    chr14 33290999 33290999 A G exonic AKAP6 nonsynonymous SNV AKAP6:NM_004274:exon13:c.A3980G:p.D1327G NA NA NA NA + GAC chr14 33290999 33290999 A G
    chr4 70156391 70156391 T C exonic UGT2B28 nonsynonymous SNV UGT2B28:NM_053039:exon5:c.T1172C:p.V391A score=0.949699;Name=chr4:70035680 NA 0.000199681 NA + GTA chr4 70156391 70156391 T C



</help>

</tool>
<?xml version="1.0"?>
<!-- File: mutspecNmf.xml
     Galaxy tool definition for MutSpec NMF: calls mutspecNmf_wrapper.sh with the HTML
     output, the number of signatures, the kind of input ("html" or "tab") and the input
     dataset. -->
<tool id="mutSpecnmf" name="MutSpec NMF" version="0.0.1">
<description>Extract mutation signatures with the Non negative Matrix Factorization algorithm</description>

<requirements>
    <requirement type="set_environment">SCRIPT_PATH</requirement>
    <requirement type="package" version="5.18.1">perl</requirement>
    <requirement type="package" version="3.1.2">R</requirement>
    <requirement type="package" version="1.7.1">numpy</requirement>
    <requirement type="package" version="0.1">mutspec</requirement>
</requirements>

<command interpreter="bash">
    mutspecNmf_wrapper.sh
    $html_file
    "--nbSign $nbsign"
    ${refGenomeSource.source}
    #if $refGenomeSource.source == "html":
        ${refGenomeSource.reportHTML}
    #else
        ${refGenomeSource.matrix}
    #end if
</command>

<inputs>
    <conditional name="refGenomeSource">
        <param name="source" type="select" label="Input a MutSpec Stats report or a matrix" help="You may select either a report generated by MutSpec-Stats or a tab-delimited text matrix">
            <option value="html">Dataset generated by the tool MutSpec-Stats</option>
            <option value="tab">Tab-delimited matrix</option>
        </param>
        <when value="html">
            <param name="reportHTML" type="data" format="html" label="Input dataset" help="Select a report generated by the MutSpec-Stats tool"/>
        </when>
        <when value="tab">
            <param name="matrix" type="data" format="tabular" label="Input matrix" help="Select a matrix formatted as shown further below"/>
        </when>
    </conditional>
    <!-- Integer with a lower bound: the value is spliced into the wrapper command line,
         so the form itself must reject anything that is not a number >= 2. -->
    <param name="nbsign" type="integer" value="2" min="2" label="Number of expected signatures" help="min=2" />
</inputs>

<outputs>
    <data name="html_file" format="html" label="NMF result on ${on_string} ($nbsign signatures)" />
</outputs>

<help>

**What it does**

Extract mutation signatures composed of 96 SBS types (6 SBS types in their trinucleotide sequence context) using the non-negative matrix factorisation (`NMF`__) algorithm of Brunet with the Kullback-Leibler divergence penalty implemented in an `R package`__.

.. __: http://www.nature.com/nature/journal/v401/n6755/full/401788a0.html
.. __: http://www.biomedcentral.com/1471-2105/11/367


--------------------------------------------------------------------------------------------------------------------------------------------------

**Input formats**

The tool accepts an HTML report produced by the tool MutSpec-Stats or a matrix of mutation counts in a tab-delimited text file format (see example below).

.. class:: warningmark

If the input is a matrix of mutation counts, the sum of the mutation counts of each row must not be zero.

--------------------------------------------------------------------------------------------------------------------------------------------------

**Output**

Matrices and graphs representing the composition of the mutation signatures found by NMF (Matrix W) and the contributions of each sample to the signatures (Matrix H). The tool also produces a matrix that can be used with the tool MutSpec-Compare for comparing the identified signatures with known signatures.

--------------------------------------------------------------------------------------------------------------------------------------------------

**Example: matrix of mutation count (96 rows + a header with the samples names)**

+--------+----------+----------+----------+
|        | Sample_1 | Sample_2 | Sample_3 |
+========+==========+==========+==========+
|A[C>A]A | 4        | 3        | 1        |
+--------+----------+----------+----------+
|A[C>T]A | 2        | 1        | 0        |
+--------+----------+----------+----------+
|A[C>G]A | 13       | 2        | 1        |
+--------+----------+----------+----------+
|A[T>A]A | 10       | 3        | 6        |
+--------+----------+----------+----------+
|A[T>C]A | 9        | 6        | 1        |
+--------+----------+----------+----------+
|A[T>G]A | 2        | 1        | 0        |
+--------+----------+----------+----------+
| ...                                     |
+--------+----------+----------+----------+
|T[C>A]T | 5        | 2        | 2        |
+--------+----------+----------+----------+
|T[C>G]T | 5        | 2        | 0        |
+--------+----------+----------+----------+
|T[C>T]T | 11       | 4        | 2        |
+--------+----------+----------+----------+
|T[T>A]T | 3        | 0        | 5        |
+--------+----------+----------+----------+
|T[T>C]T | 39       | 17       | 1        |
+--------+----------+----------+----------+
|T[T>G]T | 12       | 8        | 1        |
+--------+----------+----------+----------+


</help>

</tool>
#!/bin/bash
# File: mutspecNmf_wrapper.sh
#
# Galaxy wrapper around R/somaticSignature_Galaxy.r.
#
# Usage: mutspecNmf_wrapper.sh <html_output> "<NMF parameters>" <source> <input>
#   html_output     HTML dataset to write; the results go to the sibling "<name>_files" folder
#   NMF parameters  one quoted string of extra options for the R script (e.g. "--nbSign 2")
#   source          "html" when <input> is a MutSpec-Stats report, anything else for a matrix
#   input           the report (html) or the tab-delimited count matrix
#
# Exit status: 0 on success, 1 when the R script did not produce the signature matrix.

#########################################
###   SPECIFY THE NUMBER OF CPU       ###
#########################################
cpu=1


html=$1; shift
parameters=$1; shift
source=$1; shift
input=$1

# For a MutSpec-Stats report, the count matrix lives in the report's "_files" folder.
# "%.*" strips only the last extension, so a dot in a parent directory name is harmless.
if [[ $source == "html" ]]; then
    input="${input%.*}_files/Mutational_Analysis/Figures/Input_NMF/Input_NMF_Count.txt"
fi

output_dir="${html%.*}_files"
mkdir -p "$output_dir"

# $parameters is intentionally left unquoted: it carries several words ("--nbSign N")
# that must reach Rscript as separate arguments.
Rscript "$SCRIPT_PATH/R/somaticSignature_Galaxy.r" $parameters --cpu "$cpu" --input "$input" --output "$output_dir" 2>&1


## Test the existence of the files and graphs produced by NMF
if [[ ! -e "$output_dir/NMF/Files/MatrixW-Normto100.txt" ]]; then
    >&2 echo "error"
    # A bare "exit" would return the status of the echo above (0) and hide the failure
    exit 1
fi


# Build the HTML report. Quoted here-documents keep the attribute quotes in the output.
{
cat <<'EOF'
<html><body>
<center> <h2> NMF Mutational signatures analysis </h2> </center>

<br/>
<table>
<tr> <th><h3>Heatmap of the mixture coefficient matrix</h3></th> </tr>
<tr> <td> <center> <br/> <a href="NMF/Files/Cluster_MixtureCoeff.txt">Cluster_MixtureCoeff.txt</a> </center> </td> </tr>
<tr>
EOF

if [[ ! -e "$output_dir/NMF/Figures/Heatmap_MixtureCoeff.png" ]]; then
    echo "<td> WARNING: NMF package can't plot the heatmap when the samples size is above 300. <br/> </td>"
else
cat <<'EOF'
<td> <center> <a href="NMF/Figures/Heatmap_MixtureCoeff.png">
<img src="NMF/Figures/Heatmap_MixtureCoeff.png" /></a> </center> </td>
EOF
fi

cat <<'EOF'
</tr>
</table>

<br/><br/>

<table>
<tr>
<th><h3>Signature composition</h3></th>
</tr>
<tr><td> <center> <a href="NMF/Files/MatrixW-Normto100.txt">Composition somatic mutation (input matrix for the tool MutSpec-Compare)</a></center></td></tr>
<tr>
<td><a href="NMF/Figures/CompositionSomaticMutation.png">
<img width="1000" src="NMF/Figures/CompositionSomaticMutation.png" /></a></td>
</tr>
</table>
<br/><br/>

<table>
<tr>
<th><h3>Sample contribution to signatures</h3></th>
</tr>
<tr><td> <center> <a href="NMF/Files/MatrixH-Inputggplot2.txt">Contribution mutation signature matrix</a></center></td></tr>
<tr>
<td><a href="NMF/Figures/ContributionMutationSignature.png">
<img width="700" src="NMF/Figures/ContributionMutationSignature.png" /></a></td>
</tr>
</table>
<br/><br/>

<table>
<tr>
<th><h3>Average contributions of each signatures in each cluster</h3></th>
</tr>
<tr><td> <center> <a href="NMF/Files/Average_ContriByCluster.txt">Average contributions</a></center></td></tr>
<tr>
<td><a href="NMF/Figures/Average_ContriByCluster.png">
<img width="700" src="NMF/Figures/Average_ContriByCluster.png" /></a></td>
</tr>
</table>
<br/><br/>

<br/><br/><br/><br/>
</body></html>
EOF
} >> "$html"


exit 0
#!/usr/bin/perl
# File: mutspecSplit.pl

#-----------------------------------#
# Author: Vincent                   #
# Script: mutspecSplit.pl           #
# Last update: 01/07/14             #
#-----------------------------------#

# Split a tab-delimited file into one file per distinct value of a chosen column.
#
# Options:
#   --file|-f    input file (first line is the header)
#   --column|-i  number of the column to split on, counted from 1 (0 is treated like 1)
#   --key|-k     accepted for compatibility, not used
#   --path|-p    accepted for compatibility, not used
#
# Output: the folder "outputFiles" is created in the current directory. It receives one file
# per value of the split column, each holding the header and the matching rows WITHOUT the
# split column. The list of created files is printed on STDOUT, one path per line.

use strict;
use warnings;
use Getopt::Long;

our $file="";
our $column="";
our $path="";
our $key="";


GetOptions('file|f=s'   =>\$file,
           'key|k=s'    =>\$key,
           'column|i=s' =>\$column,
           'path|p=s'   =>\$path);


my $outDir = "outputFiles";
# Fails on purpose when the folder already exists: stale files would end up in the results
mkdir($outDir) or die ("Erreur creation repertoire $outDir: $!\n");

# Rows grouped by the value found in the split column
my %tab;

# Turn the 1-based column number into an array index; a missing or non numeric
# value falls back to the first column.
if ($column !~ /^\d+$/) {
	warn "No valid column number given, splitting on the first column\n";
	$column = 0;
}
if ($column==0) {$column++;}
$column--;

open(my $in, "<", $file) or die "cannot open $file\n";

# The first line is the header
my $headerLine = <$in>;
if (!defined $headerLine) { die "the file $file is empty\n"; }
$headerLine =~ s/[\r\n]+$//;
# Limit -1 keeps the trailing empty fields
my @headerFields = split(/\t/, $headerLine, -1);
if ($column > $#headerFields) { die "the file $file has only ".scalar(@headerFields)." column(s), cannot split on column ".($column+1)."\n"; }
my $headers = withoutColumn(\@headerFields, $column);

while (my $row = <$in>) {
	$row =~ s/[\r\n]+$//;
	# Some files have an empty line at the end
	if ($row eq "") { next; }

	my @fields = split(/\t/, $row, -1);
	if ($column > $#fields) { warn "line $. of $file skipped: no column ".($column+1)."\n"; next; }

	my $id = $fields[$column];
	if ($id eq "") { warn "line $. of $file skipped: empty value in the split column\n"; next; }
	# The value becomes a file name: keep it inside the output folder
	$id =~ s{[/\\]}{_}g;

	push( @{ $tab{$id} }, withoutColumn(\@fields, $column) );
}
close $in;


foreach my $name (sort keys %tab) {
	my $output = "$outDir/$name";
	open(my $out, ">", $output) or die "cannot create file $output \n";
	print $out $headers."\n";
	foreach my $line (@{ $tab{$name} }) {
		print $out "$line\n";
	}
	close $out;
}

# List the files created, one path per line
foreach my $created (sort glob("$outDir/*")) { print "$created\n"; }


# Join the fields with tabs, leaving out the field at index $index
sub withoutColumn
{
	my ($refFields, $index) = @_;

	my @kept = @$refFields;
	splice(@kept, $index, 1);
	return join("\t", @kept);
}
<!-- File: mutspecSplit.xml
     Galaxy tool definition for MutSpec Split: runs mutspecSplit.pl, which writes one file per
     value of the selected column into the folder "outputFiles"; those files are collected
     into a dataset list. -->
<tool id="mutSpecsplit" name="MutSpec Split" version="0.1" hidden="false" force_history_refresh="True">
<description>Split a tabular file by sample ID</description>

<requirements>
    <requirement type="set_environment">SCRIPT_PATH</requirement>
    <requirement type="package" version="5.18.1">perl</requirement>
</requirements>

<!-- Full option names are used: the script declares "file|f" and "column|i", so a short
     "-c" would only be understood through option abbreviation. -->
<command interpreter="perl">
    mutspecSplit.pl --file $input --column $column
</command>

<inputs>
    <param name="input" type="data" format="tabular" label="Input file" help="If using the batch mode (multiple datasets), all files must contain the same sample id column. The tool doesn't support dataset list as input !" />
    <param name="column" type="data_column" data_ref="input" label="Split by" use_header_names="true"/>
</inputs>

<outputs>
    <collection name="splitted_output" type="list" label="collection">
        <discover_datasets pattern="__name__" ext="tabular" directory="outputFiles"/>
    </collection>
</outputs>

<help>

**What it does**

This tool splits a file into several files based on the content of the selected column.
It can be used for example to split a file that contains data on 10 samples into 10 files using the same sample ID column.
The resulting files are saved into a dataset list/collection. The column used for the split is removed from the resulting files.

--------------------------------------------------------------------------------------------------------------------------------------------------

**Input**

One or multiple tab delimited text files.

If multiple files are selected, they should all have the same column on which you want to do the split.

.. class:: warningmark

The tool doesn't support dataset list as input !!!

--------------------------------------------------------------------------------------------------------------------------------------------------

**Output**

A dataset list containing tab delimited text files resulting from splitting the input file(s).

.. class:: warningmark

If a large number of files are generated, you'll need to refresh the history to see all files included in the dataset list. The entire list of files may still not be correctly displayed due to a known bug in Galaxy that may be fixed in future versions.

--------------------------------------------------------------------------------------------------------------------------------------------------

**Example**

Split by sample ID the following file::

    Chr Start End Ref Alt Func.refGene Gene.refGene ExonicFunc.refGene AAChange.refGene genomicSuperDups 1000g2012apr_all snp137 esp6500si_all cosmic67 Strand Context Mutation_GRCh37_chromosome_number Mutation_GRCh37_genome_position Description_Ref_Genomic Description_Alt_Genomic Sample_name Pubmed_PMID Age Comments
    chr12 82752552 82752552 G A exonic METTL25 nonsynonymous SNV NM_032230:c.G208A:p.E70K NA NA NA NA NA + GTCGGAGACGGAGGCCCTGCC chr12 82752552 G A APA29 23913001 2 NA
    chr11 86663436 86663436 C A exonic FZD4 nonsynonymous SNV NM_012193:c.G362T:p.C121F NA NA NA NA NA - GACTGAAAGACACATGCCGCC chr11 86663436 C A APA12 21311022 34 Tissue Remark Fixed:Remark
    chr12 57872994 57872994 G A exonic ARHGAP9 nonsynonymous SNV NM_001080157:c.C196T:p.R66C NA NA NA 0.000077 ID=COSM431582;OCCURENCE=2(breast) - GCTTCTAGGCGTCTTGCCAAC chr12 57872994 G A APA12 21311022 34 Tissue Remark Fixed:Remark


Will create a dataset list with two datasets (the Sample_name column is removed):


APA29::

    Chr Start End Ref Alt Func.refGene Gene.refGene ExonicFunc.refGene AAChange.refGene genomicSuperDups 1000g2012apr_all snp137 esp6500si_all cosmic67 Strand Context Mutation_GRCh37_chromosome_number Mutation_GRCh37_genome_position Description_Ref_Genomic Description_Alt_Genomic Pubmed_PMID Age Comments
    chr12 82752552 82752552 G A exonic METTL25 nonsynonymous SNV NM_032230:c.G208A:p.E70K NA NA NA NA NA + GTCGGAGACGGAGGCCCTGCC chr12 82752552 G A 23913001 2 NA


APA12::

    Chr Start End Ref Alt Func.refGene Gene.refGene ExonicFunc.refGene AAChange.refGene genomicSuperDups 1000g2012apr_all snp137 esp6500si_all cosmic67 Strand Context Mutation_GRCh37_chromosome_number Mutation_GRCh37_genome_position Description_Ref_Genomic Description_Alt_Genomic Pubmed_PMID Age Comments
    chr11 86663436 86663436 C A exonic FZD4 nonsynonymous SNV NM_012193:c.G362T:p.C121F NA NA NA NA NA - GACTGAAAGACACATGCCGCC chr11 86663436 C A 21311022 34 Tissue Remark Fixed:Remark
    chr12 57872994 57872994 G A exonic ARHGAP9 nonsynonymous SNV NM_001080157:c.C196T:p.R66C NA NA NA 0.000077 ID=COSM431582;OCCURENCE=2(breast) - GCTTCTAGGCGTCTTGCCAAC chr12 57872994 G A 21311022 34 Tissue Remark Fixed:Remark



</help>

</tool>
#!/usr/bin/env perl
# File: mutspecStat.pl

#-----------------------------------#
# Author: Maude                     #
# Script: mutspecStat.pl            #
# Last update: 09/04/16             #
#-----------------------------------#

# Entry point of the script: parse the command line, set up the global variables shared
# with the subroutines defined further down, then run the analysis in this order:
#   CheckFlags -> checkChrDir -> CheckAnnotationFile -> ReportMutDist -> removal of the
#   temporary folder.
# Exactly one positional argument (the input file or folder) is expected.

use strict;
use warnings;
use Getopt::Long;
use Pod::Usage;
use File::Basename; # my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/);
use File::Path;
use Statistics::R;
use Spreadsheet::WriteExcel;

our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested.
our ($refGenome, $output, $folder_temp, $path_R_Scripts, $path_SeqrefGenome) = ("empty", "empty", "empty", "empty", "empty"); # The reference genome to use; The path for saving the result; The path for saving the temporary files; The path to R scripts; The path to the fasta reference sequences
# Both flags take no value: GetOptions sets them to 1 when present, 2 means "not requested"
our ($poolData, $oneReportPerSample) = (2, 2); # If a folder is pass as input file pool all the data and generate the report on the pool and for each samples; # Generate one report for each samples


GetOptions('verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'refGenome=s'=>\$refGenome, 'outfile|o=s' => \$output, 'pathTemporary|temp=s' => \$folder_temp, 'pathRscript=s' => \$path_R_Scripts, 'pathSeqRefGenome=s' => \$path_SeqrefGenome, 'poolData' => \$poolData, 'reportSample' => \$oneReportPerSample) or pod2usage(2);

our ($input) = @ARGV;

pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help);
pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man);
pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 0); # No argument is pass to the command line print the usage of the script
# Only one argument is expected to be pass to @ARGV (the input): reject any extra argument,
# not just the case of exactly two.
pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV > 1);



######################################################################################################################################################
#                                                                 GLOBAL VARIABLES                                                                   #
######################################################################################################################################################
# Recover the current path
our $pwd = `pwd`;
chomp($pwd);

# Path to R scripts
our $pathRScriptTxnSB       = "$path_R_Scripts/R/transciptionalStrandBias.r";
our $pathRScriptMutSpectrum = "$path_R_Scripts/R/mutationSpectra_Galaxy.r";

# Folder receiving the report, set by CheckFlags
our $folderMutAnalysis = "";
# Components of the input path (the last one is the file or folder name)
our @pathInput = split("/", $input);

# Hash table with the length of each chromosomes
our %chromosomes;

######################################################################################################################################################
#                                                                       MAIN                                                                         #
######################################################################################################################################################
# Check the presence of the flags and create the output and temp directories
CheckFlags();

# Retrieve chromosomes length
checkChrDir();


print "-----------------------------------------------------------------\n";
print "-----------------Report Mutational Analysis----------------------\n";
print"-----------------------------------------------------------------\n";

# First check if the file is annotated or not
CheckAnnotationFile($input);

# Calculate the statistics and generate the report
my @colInfoAV = qw(Chr Start Ref Alt);
ReportMutDist($input, $folderMutAnalysis, $folder_temp, \@colInfoAV, $refGenome);

# Remove the temporary directory
rmtree($folder_temp);


######################################################################################################################################################
#                                                                    FUNCTIONS                                                                       #
+###################################################################################################################################################### + +# Check the presence of the flags and create the output and temp directories +sub CheckFlags +{ + # Check the reference genome + if($refGenome eq "empty") { print STDERR "You forget to specify the name for the reference genome!!!\nPlease specify it with the flag --refGenome\n"; exit; } + + # If no output is specified write the result as the same place as the input file + if($output eq "empty") + { + my $folderRes = ""; + for(my $i=0; $i<$#pathInput; $i++) { $folderRes .= "$pathInput[$i]/"; } + + $folderMutAnalysis = "$folderRes/Mutational_Analysis"; + if(!-e $folderMutAnalysis) { mkdir($folderMutAnalysis) or die "$!: $folderMutAnalysis\n"; } + } + else + { + if(!-e $output) { mkdir($output) or die "$!: $output\n"; } + + $folderMutAnalysis = "$output/Mutational_Analysis"; + if(!-e $folderMutAnalysis) { mkdir($folderMutAnalysis) or die "$!: $folderMutAnalysis\n"; } + } + + # If no temp folder is specified write the result in the current path + if($folder_temp eq "empty") { $folder_temp = "$pwd/TEMP_MutationalAnalysis_$pathInput[$#pathInput]"; } + if(!-e $folder_temp) { mkdir($folder_temp) or die "$!: $folder_temp\n"; } + + # Check the path to the R scripts + if($path_R_Scripts eq "empty") { print STDERR "You forget to specify the path for the R scripts!!!\nPlease specify it with the flag --pathRscript\n"; exit; } + + + # The input is a folder + if(-d $input) { foreach my $file (`ls $input`) { CheckLengthFilename("$input/$file"); } } + # The input is one file + else { CheckLengthFilename($input); } +} +# Check the length of the file, must be < 32 characters for the Excel sheet +sub CheckLengthFilename +{ + my ($inputFile) = @_; + + ## Verify the name of file, must be <= 31 chars for the sheet name + my ($filename, $directories, $suffix) = fileparse($inputFile, qr/\.[^.]*/); + + if(length($filename) > 31) { print 
STDERR "The file: $inputFile must be <= 31 chars\nPlease modify it before running the script\n"; exit; } +} + +# Retrieve chromosomes length +sub checkChrDir +{ + my @files = `ls $path_SeqrefGenome/$refGenome"_seq"/*.fa`; + foreach my $file (@files) + { + if ($file !~ /chr(\d+|x|y)\.fa/i){next;} + open(FILE,$file); + <FILE>; + my $seq=""; + while (<FILE>){ chomp; $seq.=$_;} + $file =~ /chr(.*)\.fa/; + $chromosomes{"chr".$1}=length($seq); + } +} + +# Check if the file is annotated or not +sub CheckAnnotationFile +{ + my ($inputFile) = @_; + + # A folder is pass in argument + if(-d $inputFile) + { + foreach my $file (`ls $inputFile`) + { + chomp($file); + + open(F1, "$inputFile/$file") or die "$!: $inputFile/$file\n"; + my $search_header = <F1>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header); + close F1; + # The number of the column + my $value_of_column_NB = "toto"; + for(my $i=0; $i<=$#tab_search_header; $i++) + { + if($tab_search_header[$i] eq "Func.refGene") { $value_of_column_NB = $i; } + } + if($value_of_column_NB eq "toto") { print STDERR "Error the input file you specify is not annotated! $inputFile/$file !!!!\nPlease first annotate your file before trying to generate the report on mutation patterns\n"; exit; } + } + } + else + { + open(F1, $inputFile) or die "$!: $inputFile\n"; + my $search_header = <F1>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header); + close F1; + # The number of the column + my $value_of_column_NB = "toto"; + for(my $i=0; $i<=$#tab_search_header; $i++) + { + if($tab_search_header[$i] eq "Func.refGene") { $value_of_column_NB = $i; } + } + if($value_of_column_NB eq "toto") { print STDERR "Error the input file you specify is not annotated! 
$inputFile !!!!\nPlease first annotate your file before trying to generate the report on mutation patterns\n"; exit; } + } +} + +# Calculate the statistics and generate the report +sub ReportMutDist +{ + our ($input, $output, $folder_temp, $refTab_colInfo, $refGenome) = @_; + + my @column = @$refTab_colInfo; + + our ($chr_name, $start_name, $ref_name, $alt_name) = split(/,/, join(',', @column)); # Separe each element + + our $func_name = "Func.refGene"; + our $exonicFunc_name = "ExonicFunc.refGene"; + our $strand_name = "Strand"; + our $context_name = "context"; + + my $folderFigure = "$output/Figures"; + if(-e $folderFigure) { rmtree($folderFigure); mkdir($folderFigure) or die "Can't create the directory $folderFigure\n"; } + else { mkdir($folderFigure) or die "Can't create the directory $folderFigure\n"; } + my $folderChi2 = "$folderFigure/Chi2"; + if(!-e $folderChi2) { mkdir($folderChi2) or die "Can't create the directory $folderChi2\n"; } + my $folderWebLogo = "$folderFigure/WebLogo"; + if(!-e $folderWebLogo) { mkdir($folderWebLogo) or die "Can't create the directory $folderWebLogo\n"; } + my $folderNMF = "$folderFigure/Input_NMF"; + if(!-e $folderNMF) { mkdir($folderNMF) or die "Can't create the directory $folderNMF\n"; } + + ################################################################################################ + ### Calculates all the statistics ### + ################################################################################################ + ############ Recover Annovar annotations (for having the save number of functional regions for each samples) + my @tab_func = recoverAnnovarAnnotation($input, $func_name); + if(@tab_func == 0) { print STDERR "Error the table for the functional region is empty!!!!! 
check $input $func_name\n"; exit; }

############ Calculate the different statistics present in the report
my %h_file = ();
CalculateStatistics(\%h_file, \@tab_func);

############ Calculate the chi2 for the strand bias
CalculateChi2(\%h_file, $folderChi2);

############ Write the different statistics present in the report
WriteStatistics(\%h_file, $#tab_func, $folderFigure, $folderChi2, $folderNMF);

############ Create logo for studying the 10 flanking bases of the mutation
CreateLogo(\%h_file, $folderWebLogo);


################### Subroutines for generating the report for the mutational analysis

# Return the sorted names (not the paths) of the non-hidden entries of a folder.
# Replaces the former `ls $folder` calls: no shell is involved, so folder names
# containing spaces or shell metacharacters are handled correctly.
sub _listFolderEntries
{
    my ($folder) = @_;

    opendir(my $dh, $folder) or die "$!: $folder\n";
    my @entries = sort grep { !/^\./ } readdir($dh);
    closedir($dh);

    return @entries;
}

# Return the number of lines of a file (header included).
# A last line without a trailing newline is counted, unlike with `wc -l`.
sub _countFileLines
{
    my ($file) = @_;

    open(my $fh, "<", $file) or die "$!: $file\n";
    my $nbLines = 0;
    while(my $line = <$fh>) { $nbLines++; }
    close($fh);

    return $nbLines;
}

# Control one annotated file and add its statistics to the hash table.
#   $file       : path of the annotated file
#   $refH_file  : reference to the hash filled by File2Hash
#   $refT_func  : reference to the list of functional regions
# The script exits (after a message on STDERR) when the file contains no variant
# or no variant with a strand information.
sub _processAnnotatedFile
{
    my ($file, $refH_file, $refT_func) = @_;

    ############ Recover the number of the columns of interest
    # NOTE(review): recoverNumCol is defined elsewhere; presumably it returns the
    # index of the named column - verify. $start_value is not used afterwards, the
    # call is kept in case recoverNumCol also validates the presence of the column.
    my $chr_value        = recoverNumCol($file, $chr_name);
    my $start_value      = recoverNumCol($file, $start_name);
    my $ref_value        = recoverNumCol($file, $ref_name);
    my $alt_value        = recoverNumCol($file, $alt_name);
    my $func_value       = recoverNumCol($file, $func_name);
    my $exonicFunc_value = recoverNumCol($file, $exonicFunc_name);
    my $strand_value     = recoverNumCol($file, $strand_name);
    my $contextSeq_value = recoverNumCol($file, $context_name);

    ############ Control the annotated file pass in argument
    ## Check if the file has variants (more than the header line)
    if(_countFileLines($file) <= 1) { print STDERR "\n\nNo line in the file $file\n\n"; exit; }
    ## Check if there is variant with strand information. If not the rest of the script generates errors
    my $testFile = 0;
    CheckVariantReport($file, $strand_value, \$testFile);
    if($testFile==0) { print STDERR "\n\nNo strand information for the file $file\n\n"; exit; }

    ############ Calculate the statistics
    File2Hash($file, $func_value, $exonicFunc_value, $chr_value, $ref_value, $alt_value, $strand_value, $contextSeq_value, $refH_file, $refT_func);
}

# Recover the list of the distinct annotations present in a column.
#   $input         : an annotated file, or a folder containing annotated files
#   $AV_annotation : name of the column to read (passed to recoverNumCol)
# Returns the sorted list of the distinct values found. When a value is repeated
# and separated by ";" only the first element is considered.
sub recoverAnnovarAnnotation
{
    my ($input, $AV_annotation) = @_;

    # The input is either a folder (all its files are read) or a single file
    my @files = (-d $input) ? (map { "$input/$_" } _listFolderEntries($input)) : ($input);

    my %hash = ();
    foreach my $file (@files)
    {
        my $AV_annotation_value = recoverNumCol($file, $AV_annotation);

        open(my $fh, "<", $file) or die "$!: $file\n";
        my $header = <$fh>;
        while(my $line = <$fh>)
        {
            $line =~ s/[\r\n]+$//;
            my @tab = split("\t", $line);

            # Some files can have an empty line at the end and WE DON'T WANT to consider it
            # (the same holds for a line too short to contain the column of interest)
            if(! defined $tab[0]) { next; }
            if(! defined $tab[$AV_annotation_value]) { next; }

            # Some func value are repeated and separated by ";"
            my $funcSegment = $tab[$AV_annotation_value];
            if($funcSegment =~ /;/)
            {
                my @temp = split(";", $funcSegment);
                $funcSegment = defined $temp[0] ? $temp[0] : "";
            }

            $hash{$funcSegment} = "";
        }
        close($fh);
    }

    my @tab_AVAnnotation = sort keys %hash;
    return @tab_AVAnnotation;
}
# Calculate the different statistics present in the report
#   $refH_file : reference to the hash that will contain the statistics
#   $refT_func : reference to the list of functional regions
# Reads the file-level variables $input, $folder_temp and $poolData. When $input is
# a folder, every sample is copied into $folder_temp/Pool (plus Pool_Data.txt, the
# concatenation of all the samples, when $poolData == 1) and every file of that
# folder is then processed.
sub CalculateStatistics
{
    my ($refH_file, $refT_func) = @_;

    # If the input is a file
    if(!-d $input)
    {
        _processAnnotatedFile($input, $refH_file, $refT_func);
        return;
    }

    # If the input is a folder
    my $folderPool = "$folder_temp/Pool";
    if(!-e $folderPool) { mkdir($folderPool) or die "Can't create the directory $folderPool\n"; }

    my @listFile = _listFolderEntries($input);
    if(!@listFile) { print STDERR "\n\nNo file in the folder $input\n\n"; exit; }

    # Copy each sample (a failure is reported but is not fatal, as with the former `cp`)
    require File::Copy;
    foreach my $file (@listFile)
    {
        File::Copy::copy("$input/$file", "$folderPool/$file") or warn "Can't copy $input/$file to $folderPool: $!\n";
    }

    # Generate the pool of all the data
    if($poolData == 1)
    {
        open(my $out, ">", "$folderPool/Pool_Data.txt") or die "$!: $folderPool/Pool_Data.txt\n";

        for(my $i=0; $i<=$#listFile; $i++)
        {
            open(my $in, "<", "$input/$listFile[$i]") or die "$!: $input/$listFile[$i]\n";

            # For keeping the header only one time
            my $header = <$in>;
            if( ($i == 0) && (defined $header) )
            {
                if($header !~ /\n$/) { $header .= "\n"; }
                print $out $header;
            }

            while(my $line = <$in>)
            {
                # The last line of a sample can miss its end of line: add it for not
                # merging this line with the first line of the next sample
                if($line !~ /\n$/) { $line .= "\n"; }
                print $out $line;
            }
            close($in);
        }
        close($out);
    }

    foreach my $file (_listFolderEntries($folderPool))
    {
        _processAnnotatedFile("$folderPool/$file", $refH_file, $refT_func);
    }
}
# Check if there is at least one variant with a strand information
#   $file          : path of the annotated file (the first line is a header)
#   $strand_value  : index of the column containing the strand
#   $refS_testFile : reference to a counter, incremented for each variant with a
#                    strand equal to "+" or "-"
sub CheckVariantReport
{
    my ($file, $strand_value, $refS_testFile) = @_;

    open(my $fh, "<", $file) or die "$!: $file\n";
    my $header = <$fh>;
    while(my $line = <$fh>)
    {
        $line =~ s/[\r\n]+$//;
        my @tab = split("\t", $line);

        # Empty or truncated lines don't have the strand column
        if(! defined $tab[$strand_value]) { next; }

        if( ($tab[$strand_value] eq "+") || ($tab[$strand_value] eq "-") ) { $$refS_testFile++; }
    }
    close($fh);
}
# Convert the annotated VCF into a hash table
sub File2Hash
{
    my ($inputFile, $func_value, $exonicFunc_value, $chr_value, $ref_value, $alt_value, $strand_value, $contextSeq_value, $refH_file, $refT_func) = @_;
    my ($filename, $directories, $suffix) = fileparse($inputFile, qr/\.[^.]*/);

    # Initialisation of the hash
    my @tab_mutation = qw(C:G>A:T C:G>G:C C:G>T:A T:A>A:T T:A>C:G T:A>G:C);
    my @tab_aaChange = ("NonTr", "Tr", "TotalMutG");
    my @tabExoFunc = ("frameshift insertion", "frameshift deletion", "frameshift block substitution", "frameshift substitution", "stopgain", "stoploss", "nonframeshift insertion", "nonframeshift deletion", "nonframeshift substitution", "nonframeshift block substitution", "nonsynonymous SNV", "synonymous SNV", "unknown", "NA");

    # Total number of SBS on the genomic strand
    $refH_file->{$filename}{'TotalSBSGenomic'} = 0;
    # Total number of Indel on the genomic strand
    $refH_file->{$filename}{'TotalIndelGenomic'} = 0;
    # Total number of SBS on the coding strand
    $refH_file->{$filename}{'TotalSBSCoding'} = 0;
    # Total number of SBS and Indel on the genomic strand
$refH_file->{$filename}{'TotalMutGenomic'} = 0; + + ##################################### + # SBS by segment (6 mutation types) # + ##################################### + foreach my $elt_tabFunc (@$refT_func) + { + foreach my $elt_tabMutation (@tab_mutation) + { + foreach my $elt_aaChange (@tab_aaChange) + { + $refH_file->{$filename}{'6mutType'}{$elt_tabFunc}{$elt_tabMutation}{$elt_aaChange} = 0; + } + } + } + + ####################### + # Pearson correlation # + ####################### + $refH_file->{$filename}{'SBSPerChr'}{'AllMutType'} = 0; + # Count of SBS per chromosome foreach mutation types + foreach my $elt_tabMutation (@tab_mutation) + { + foreach my $chromosome (sort keys %chromosomes){ $refH_file->{$filename}{'SBSPerChr'}{$elt_tabMutation}{'CHR'}{$chromosome}{'chr'} = 0;} + $refH_file->{$filename}{'SBSPerChr'}{$elt_tabMutation}{'Pearson'} = 0; + } + + foreach my $chromosome (sort keys %chromosomes){ + $refH_file->{$filename}{'SBSPerChr'}{'TotalPerChr'}{$chromosome}{'chr'}=0; + } + + ############################ + # Impact of SBS on protein # + ############################ + foreach my $elt_exoFunc (@tabExoFunc) + { + $refH_file->{$filename}{'ImpactSBS'}{$elt_exoFunc} = 0; + } + + ##################################### + # Sequence context (genomic strand) # + ##################################### + my @tab_mutation2 = qw(C>A C>G C>T T>A T>C T>G); + my @tab_context = qw(A_A A_C A_G A_T C_A C_C C_G C_T G_A G_C G_G G_T T_A T_C T_G T_T); + foreach my $elt_context (@tab_context) + { + foreach my $elt_mutation3 (@tab_mutation2) + { + $refH_file->{$filename}{'SeqContextG'}{$elt_context}{$elt_mutation3} = 0; + } + } + + #################################### + # Sequence context (coding strand) # + #################################### + my @tab_TrNonTr = qw(NonTr Tr); + foreach my $elt_context (@tab_context) + { + foreach my $elt_mutation2 (@tab_mutation2) + { + foreach my $trNonTr (@tab_TrNonTr) + { + 
$refH_file->{$filename}{'SeqContextC'}{$elt_context}{$elt_mutation2}{$trNonTr} = 0; + } + } + } + + open(F1,$inputFile) or die "$!: $inputFile\n"; + my $header = <F1>; + while(<F1>) + { + $_ =~ s/[\r\n]+$//; + my @tab = split("\t", $_); + + # Random chromosome and chromosome M + if( ($tab[$chr_value] =~ /random/i) || ($tab[$chr_value] =~ /M/i) ) { next; } + + ############################################## Extract the base just before and after the mutation ############################################## + my $context = ""; + my $contextSequence = $tab[$contextSeq_value]; $contextSequence =~ tr/a-z/A-Z/; + my @tempContextSequence = split("", $contextSequence); + my $total_nbBaseContext = $#tempContextSequence; + my $midlle_totalNbBaseContext = $total_nbBaseContext/2; # For having the middle of the sequence + my $before = $midlle_totalNbBaseContext - 1; my $after = $midlle_totalNbBaseContext + 1; + $context = $tempContextSequence[$before]."_".$tempContextSequence[$after]; + ############################################## Extract the base just before and after the mutation ############################################## + + + ############################################################### Impact on protein ############################################################### + my $exoFunc = ""; + # Sometimes the annotation is repeated frameshift deletion;frameshift deletion + if($tab[$exonicFunc_value] =~ /\;/) + { + my @temp = split(";", $tab[$exonicFunc_value]); + if($temp[0] eq $temp[1]) { $exoFunc = $temp[0]; } + } + # The annotations have changed after MAJ Annovar 2014Jul22 (stopgain SNV => stopgain) + elsif($tab[$exonicFunc_value] eq "stopgain SNV") { $exoFunc = "stopgain"; } + elsif($tab[$exonicFunc_value] eq "stoploss SNV") { $exoFunc = "stoploss"; } + elsif($tab[$exonicFunc_value] eq "nonsynonymous_SNV") { $exoFunc = "nonsynonymous SNV"; } + elsif($tab[$exonicFunc_value] eq "stopgain_SNV") { $exoFunc = "stopgain SNV"; } + elsif($tab[$exonicFunc_value] eq 
"synonymous_SNV") { $exoFunc = "synonymous SNV"; } + else { $exoFunc = $tab[$exonicFunc_value]; } + + if(exists $refH_file->{$filename}{'ImpactSBS'}{$exoFunc}) + { + # If the sequence context if not recovered correctly don't considered the variants + if( ($context =~ /N/) || (length($context) != 3) ) { next; } + + $refH_file->{$filename}{'ImpactSBS'}{$exoFunc}++; + $refH_file->{$filename}{'TotalMutGenomic'}++; + } + else { print "WARNING: Exonic function not considered: $exoFunc\n"; } + ############################################################### Impact on protein ############################################################### + + ################################################### Only SBS are considered for the statistics ################################################## + if( ($tab[$ref_value] =~ /^[ACGT]$/i) && ($tab[$alt_value] =~ /^[ACGT]$/i) ) + { + # If the sequence context if not recovered correctly don't considered the variants + if( ($context =~ /N/) || (length($context) != 3) ) { next; } + + # Total number of SBS on the genomic strand + $refH_file->{$filename}{'TotalSBSGenomic'}++; + + # Total number of SBS on the coding strand with a sequence context + if( ($tab[$strand_value] eq "+") || ($tab[$strand_value] eq "-") ) + { + if( ($context ne "NA") && (($context =~ /N/) || (length($context) != 3)) ) { next; } + $refH_file->{$filename}{'TotalSBSCoding'}++; + } + } + else { $refH_file->{$filename}{'TotalIndelGenomic'}++; } + ################################################### Only SBS are considered for the statistics ################################################## + + # Number of SBS per chromosome: remove the "chr" + my $chrNameForH=$tab[$chr_value]; + if(exists $refH_file->{$filename}{'SBSPerChr'}{'TotalPerChr'}{$chrNameForH}{'chr'}) { $refH_file->{$filename}{'SBSPerChr'}{'TotalPerChr'}{$chrNameForH}{'chr'}++; } + + + ################################################### Some func value are repeated and separated by ";" 
################################################## + my $funcSegment = ""; + if($tab[$func_value] =~ /;/) { my @temp = split(";", $tab[$func_value]); $funcSegment = $temp[0]; } + else { $funcSegment = $tab[$func_value]; } + + + ############################################################### MUTATION C> ############################################# + ###################################### C:G>A:T + if( (($tab[$ref_value] eq "C") && ($tab[$alt_value] eq "A")) || ( ($tab[$ref_value] eq "G") && ($tab[$alt_value] eq "T") ) ) + { + my $mutation = "C:G>A:T"; + $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'TotalMutG'}++; # Count the total number of mutations + + # Pearson correlation + if(exists $refH_file->{$filename}{'SBSPerChr'}{$mutation}{'CHR'}{$chrNameForH}{'chr'}) { $refH_file->{$filename}{'SBSPerChr'}{$mutation}{'CHR'}{$chrNameForH}{'chr'}++; } + + # Sequence context - 6 mutation types - genomic strand + my $mutationSeqContext6mutType = "C>A"; + # We want to express the mutation in C> + if( ($tab[$ref_value] eq "G") && ($tab[$alt_value] eq "T") ) + { + my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]); + my $context_reverse = $base5."_".$base3; + if(exists $refH_file->{$filename}{'SeqContextG'}{$context_reverse}{$mutationSeqContext6mutType}) { $refH_file->{$filename}{'SeqContextG'}{$context_reverse}{$mutationSeqContext6mutType}++; } + } + elsif(exists $refH_file->{$filename}{'SeqContextG'}{$context}{$mutationSeqContext6mutType}) { $refH_file->{$filename}{'SeqContextG'}{$context}{$mutationSeqContext6mutType}++; } + + # Strand analysis C>A on NonTr strand + if( (($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "C")&&($tab[$alt_value] eq "A"))) || (($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "G")&&($tab[$alt_value] eq "T"))) ) + { + if(exists $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'NonTr'}) { 
$refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'NonTr'}++; } + + # C>A With the sequence context (C>A strand = +) + if( ($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "C")&&($tab[$alt_value] eq "A")) ) + { + if(exists $refH_file->{$filename}{'SeqContextC'}{$context}{'C>A'}{'NonTr'}) { $refH_file->{$filename}{'SeqContextC'}{$context}{'C>A'}{'NonTr'}++; } + } + # C>A With the sequence context (G>T strand = -) + else + { + my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]); + my $context_reverse = $base5."_".$base3; + if(exists $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'C>A'}{'NonTr'}) { $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'C>A'}{'NonTr'}++; } + } + } + # Strand analysis C>A on Tr strand + if( (($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "C")&&($tab[$alt_value] eq "A"))) || (($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "G")&&($tab[$alt_value] eq "T"))) ) + { + if(exists $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'Tr'}) { $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'Tr'}++; } + + # C>A With the sequence context (C>A strand = -) + if( ($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "C")&&($tab[$alt_value] eq "A")) ) + { + if(exists $refH_file->{$filename}{'SeqContextC'}{$context}{'C>A'}{'Tr'}) { { $refH_file->{$filename}{'SeqContextC'}{$context}{'C>A'}{'Tr'}++; } } + } + # C>A with the sequence context (G>T strand = +) + if( ($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "G")&&($tab[$alt_value] eq "T")) ) + { + my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]); + my $context_reverse = $base5."_".$base3; + if(exists $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'C>A'}{'Tr'}) { $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'C>A'}{'Tr'}++; } + } + } + # WebLogo-3 + if(($tab[$ref_value] eq "C") && ($tab[$alt_value] eq 
"A")) + { + # For the logo all the sequences must have the same length + if(scalar(@tempContextSequence) == 2) { next; } + my ($contextTemp1, $contextTemp2) = ("", ""); + for(my $i=0; $i<$midlle_totalNbBaseContext; $i++) { $contextTemp1 .= $tempContextSequence[$i]; } + for(my $i=$midlle_totalNbBaseContext+1; $i<=$#tempContextSequence; $i++) { $contextTemp2 .= $tempContextSequence[$i]; } + my $context = $contextTemp1."C".$contextTemp2; + push(@{$refH_file->{$filename}{'WebLogo3'}{'CA'}}, $context); + } + else + { + if(scalar(@tempContextSequence) == 2) { next; } + my ($contextTemp1, $contextTemp2) = ("", ""); + for(my $i=0; $i<$midlle_totalNbBaseContext; $i++) { $contextTemp1 .= complement($tempContextSequence[$i]); } + for(my $i=$midlle_totalNbBaseContext+1; $i<=$#tempContextSequence; $i++) { $contextTemp2 .= complement($tempContextSequence[$i]); } + my $context = $contextTemp1."C".$contextTemp2; $context = reverse $context; + push(@{$refH_file->{$filename}{'WebLogo3'}{'CA'}}, $context); + } + } + ###################################### C:G>G:C + if( (($tab[$ref_value] eq "C") && ($tab[$alt_value] eq "G")) || ( ($tab[$ref_value] eq "G") && ($tab[$alt_value] eq "C") ) ) + { + my $mutation = "C:G>G:C"; + $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'TotalMutG'}++; # Count the total number of mutations + + # Pearson correlation + if(exists $refH_file->{$filename}{'SBSPerChr'}{$mutation}{'CHR'}{$chrNameForH}{'chr'}) { $refH_file->{$filename}{'SBSPerChr'}{$mutation}{'CHR'}{$chrNameForH}{'chr'}++; } + + # Sequence context - 6 mutation types - genomic strand + my $mutationSeqContext6mutType = "C>G"; + # We want to express the mutation in C> + if( ($tab[$ref_value] eq "G") && ($tab[$alt_value] eq "C") ) + { + my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]); + my $context_reverse = $base5."_".$base3; + if(exists $refH_file->{$filename}{'SeqContextG'}{$context_reverse}{$mutationSeqContext6mutType}) { 
$refH_file->{$filename}{'SeqContextG'}{$context_reverse}{$mutationSeqContext6mutType}++; } + } + elsif(exists $refH_file->{$filename}{'SeqContextG'}{$context}{$mutationSeqContext6mutType}) { $refH_file->{$filename}{'SeqContextG'}{$context}{$mutationSeqContext6mutType}++; } + + # Strand analysis C>G on NonTr strand + if( (($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "C")&&($tab[$alt_value] eq "G"))) || (($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "G")&&($tab[$alt_value] eq "C"))) ) + { + if(exists $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'NonTr'}) { $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'NonTr'}++; } + + # C>G with the sequence context (C>G strand = +) + if( ($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "C")&&($tab[$alt_value] eq "G")) ) + { + if(exists $refH_file->{$filename}{'SeqContextC'}{$context}{'C>G'}{'NonTr'}) { $refH_file->{$filename}{'SeqContextC'}{$context}{'C>G'}{'NonTr'}++; } + } + # C>G with the sequence context (G>C strand = -) + if( ($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "G")&&($tab[$alt_value] eq "C")) ) + { + my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]); + my $context_reverse = $base5."_".$base3; + if(exists $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'C>G'}{'NonTr'}) { $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'C>G'}{'NonTr'}++; } + } + } + # Strand analysis C>G on Tr strand + if( (($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "C")&&($tab[$alt_value] eq "G"))) || (($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "G")&&($tab[$alt_value] eq "C"))) ) + { + if(exists $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'Tr'}) { $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'Tr'}++; } + + # C>G with the sequence context (C>G strand = -) + if( ($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "C")&&($tab[$alt_value] eq "G")) ) + { + 
if(exists $refH_file->{$filename}{'SeqContextC'}{$context}{'C>G'}{'Tr'}) { $refH_file->{$filename}{'SeqContextC'}{$context}{'C>G'}{'Tr'}++; } + } + # C>G with the sequence context (G>C strand = +) + if( ($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "G")&&($tab[$alt_value] eq "C")) ) + { + my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]); + my $context_reverse = $base5."_".$base3; + if(exists $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'C>G'}{'Tr'}) { $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'C>G'}{'Tr'}++; } + } + } + # WebLogo-3 + if(($tab[$ref_value] eq "C") && ($tab[$alt_value] eq "G")) + { + if(scalar(@tempContextSequence) == 2) { next; } + my ($contextTemp1, $contextTemp2) = ("", ""); + for(my $i=0; $i<$midlle_totalNbBaseContext; $i++) { $contextTemp1 .= $tempContextSequence[$i]; } + for(my $i=$midlle_totalNbBaseContext+1; $i<=$#tempContextSequence; $i++) { $contextTemp2 .= $tempContextSequence[$i]; } + my $context = $contextTemp1."C".$contextTemp2; + push(@{$refH_file->{$filename}{'WebLogo3'}{'CG'}}, $context); + } + else + { + if(scalar(@tempContextSequence) == 2) { next; } + my ($contextTemp1, $contextTemp2) = ("", ""); + for(my $i=0; $i<$midlle_totalNbBaseContext; $i++) { $contextTemp1 .= complement($tempContextSequence[$i]); } + for(my $i=$midlle_totalNbBaseContext+1; $i<=$#tempContextSequence; $i++) { $contextTemp2 .= complement($tempContextSequence[$i]); } + my $context = $contextTemp1."C".$contextTemp2; $context = reverse $context; + push(@{$refH_file->{$filename}{'WebLogo3'}{'CG'}}, $context); + } + } + ###################################### C:G>T:A + if( (($tab[$ref_value] eq "C") && ($tab[$alt_value] eq "T")) || ( ($tab[$ref_value] eq "G") && ($tab[$alt_value] eq "A") ) ) + { + my $mutation = "C:G>T:A"; + $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'TotalMutG'}++; # Count the total number of mutations + + # Pearson correlation + if(exists 
$refH_file->{$filename}{'SBSPerChr'}{$mutation}{'CHR'}{$chrNameForH}{'chr'}) { $refH_file->{$filename}{'SBSPerChr'}{$mutation}{'CHR'}{$chrNameForH}{'chr'}++; } + + # Sequence context - 6 mutation types - genomic strand + my $mutationSeqContext6mutType = "C>T"; + # We want to express the mutation in C> + if( ($tab[$ref_value] eq "G") && ($tab[$alt_value] eq "A") ) + { + my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]); + my $context_reverse = $base5."_".$base3; + if(exists $refH_file->{$filename}{'SeqContextG'}{$context_reverse}{$mutationSeqContext6mutType}) { $refH_file->{$filename}{'SeqContextG'}{$context_reverse}{$mutationSeqContext6mutType}++; } + } + elsif(exists $refH_file->{$filename}{'SeqContextG'}{$context}{$mutationSeqContext6mutType}) { $refH_file->{$filename}{'SeqContextG'}{$context}{$mutationSeqContext6mutType}++; } + + # Strand analysis C>T on NonTr strand + if( (($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "C")&&($tab[$alt_value] eq "T"))) || (($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "G")&&($tab[$alt_value] eq "A"))) ) + { + if(exists $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'NonTr'}) { $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'NonTr'}++; } + + # C>T with the sequence context (C>T strand = +) + if( ($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "C")&&($tab[$alt_value] eq "T")) ) + { + if(exists $refH_file->{$filename}{'SeqContextC'}{$context}{'C>T'}{'NonTr'}) { $refH_file->{$filename}{'SeqContextC'}{$context}{'C>T'}{'NonTr'}++; } + } + # C>T with the sequence context (G>A strand = -) + if( ($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "G")&&($tab[$alt_value] eq "A")) ) + { + my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]); + my $context_reverse = $base5."_".$base3; + if(exists $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'C>T'}{'NonTr'}) { 
$refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'C>T'}{'NonTr'}++; } + } + } + # Strand analysis C>T on Tr strand + if( (($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "C")&&($tab[$alt_value] eq "T"))) || (($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "G")&&($tab[$alt_value] eq "A"))) ) + { + if(exists $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'Tr'}) { $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'Tr'}++; } + + # C>T with the sequence context (C>T strand = -) + if( ($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "C")&&($tab[$alt_value] eq "T")) ) + { + if(exists $refH_file->{$filename}{'SeqContextC'}{$context}{'C>T'}{'Tr'}) { $refH_file->{$filename}{'SeqContextC'}{$context}{'C>T'}{'Tr'}++; } + } + # C>T with the sequence context (G>A strand = +) + if( ($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "G")&&($tab[$alt_value] eq "A")) ) + { + my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]); + my $context_reverse = $base5."_".$base3; + if(exists $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'C>T'}{'Tr'}) { $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'C>T'}{'Tr'}++; } + } + } + # WebLogo-3 + if(($tab[$ref_value] eq "C") && ($tab[$alt_value] eq "T")) + { + if(scalar(@tempContextSequence) == 2) { next; } + my ($contextTemp1, $contextTemp2) = ("", ""); + for(my $i=0; $i<$midlle_totalNbBaseContext; $i++) { $contextTemp1 .= $tempContextSequence[$i]; } + for(my $i=$midlle_totalNbBaseContext+1; $i<=$#tempContextSequence; $i++) { $contextTemp2 .= $tempContextSequence[$i]; } + my $context = $contextTemp1."C".$contextTemp2; + push(@{$refH_file->{$filename}{'WebLogo3'}{'CT'}}, $context); + } + else + { + if(scalar(@tempContextSequence) == 2) { next; } + my ($contextTemp1, $contextTemp2) = ("", ""); + for(my $i=0; $i<$midlle_totalNbBaseContext; $i++) { $contextTemp1 .= complement($tempContextSequence[$i]); } + for(my 
$i=$midlle_totalNbBaseContext+1; $i<=$#tempContextSequence; $i++) { $contextTemp2 .= complement($tempContextSequence[$i]); } + my $context = $contextTemp1."C".$contextTemp2; $context = reverse $context; + push(@{$refH_file->{$filename}{'WebLogo3'}{'CT'}}, $context); + } + } + + ############################################################### MUTATION T> ############################################# + ###################################### T:A>A:T + if( (($tab[$ref_value] eq "T") && ($tab[$alt_value] eq "A")) || ( ($tab[$ref_value] eq "A") && ($tab[$alt_value] eq "T") ) ) + { + my $mutation = "T:A>A:T"; + $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'TotalMutG'}++; # Count the total number of mutations + + # Pearson correlation + if(exists $refH_file->{$filename}{'SBSPerChr'}{$mutation}{'CHR'}{$chrNameForH}{'chr'}) { $refH_file->{$filename}{'SBSPerChr'}{$mutation}{'CHR'}{$chrNameForH}{'chr'}++; } + + # Sequence context - 6 mutation types - genomic strand + my $mutationSeqContext6mutType = "T>A"; + # We want to express the mutation in T> + if( ($tab[$ref_value] eq "A") && ($tab[$alt_value] eq "T") ) + { + my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]); + my $context_reverse = $base5."_".$base3; + if(exists $refH_file->{$filename}{'SeqContextG'}{$context_reverse}{$mutationSeqContext6mutType}) { $refH_file->{$filename}{'SeqContextG'}{$context_reverse}{$mutationSeqContext6mutType}++; } + } + elsif(exists $refH_file->{$filename}{'SeqContextG'}{$context}{$mutationSeqContext6mutType}) { $refH_file->{$filename}{'SeqContextG'}{$context}{$mutationSeqContext6mutType}++; } + + # Strand analysis T>A on NonTr stand + if( (($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "T")&&($tab[$alt_value] eq "A"))) || (($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "A")&&($tab[$alt_value] eq "T"))) ) + { + if(exists $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'NonTr'}) { 
$refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'NonTr'}++; } + + # T>A with the sequence context (T>A strand = +) + if( ($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "T")&&($tab[$alt_value] eq "A")) ) + { + if(exists $refH_file->{$filename}{'SeqContextC'}{$context}{'T>A'}{'NonTr'}) { $refH_file->{$filename}{'SeqContextC'}{$context}{'T>A'}{'NonTr'}++; } + } + # T>A with the sequence context (A>T strand = -) + else + { + my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]); + my $context_reverse = $base5."_".$base3; + if(exists $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'T>A'}{'NonTr'}) { $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'T>A'}{'NonTr'}++; } + } + } + # Strand analysis T>A on Tr strand + if( (($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "T")&&($tab[$alt_value] eq "A"))) || (($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "A")&&($tab[$alt_value] eq "T"))) ) + { + if(exists $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'Tr'}) { $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'Tr'}++; } + + # T>A <ith the sequence context (T>A strand = -) + if( ($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "T")&&($tab[$alt_value] eq "A")) ) + { + if(exists $refH_file->{$filename}{'SeqContextC'}{$context}{'T>A'}{'Tr'}) { $refH_file->{$filename}{'SeqContextC'}{$context}{'T>A'}{'Tr'}++; } + } + # T>A with the sequence context (A>T strand = +) + else + { + my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]); + my $context_reverse = $base5."_".$base3; + if(exists $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'T>A'}{'Tr'}) { $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'T>A'}{'Tr'}++; } + } + } + # WebLogo-3 + if(($tab[$ref_value] eq "T") && ($tab[$alt_value] eq "A")) + { + if(scalar(@tempContextSequence) == 2) { next; } + my ($contextTemp1, 
$contextTemp2) = ("", ""); + for(my $i=0; $i<$midlle_totalNbBaseContext; $i++) { $contextTemp1 .= $tempContextSequence[$i]; } + for(my $i=$midlle_totalNbBaseContext+1; $i<=$#tempContextSequence; $i++) { $contextTemp2 .= $tempContextSequence[$i]; } + my $context = $contextTemp1."T".$contextTemp2; + push(@{$refH_file->{$filename}{'WebLogo3'}{'TA'}}, $context); + } + else + { + if(scalar(@tempContextSequence) == 2) { next; } + my ($contextTemp1, $contextTemp2) = ("", ""); + for(my $i=0; $i<$midlle_totalNbBaseContext; $i++) { $contextTemp1 .= complement($tempContextSequence[$i]); } + for(my $i=$midlle_totalNbBaseContext+1; $i<=$#tempContextSequence; $i++) { $contextTemp2 .= complement($tempContextSequence[$i]); } + my $context = $contextTemp1."T".$contextTemp2; $context = reverse $context; + push(@{$refH_file->{$filename}{'WebLogo3'}{'TA'}}, $context); + } + } + ###################################### T:A>C:G + if( (($tab[$ref_value] eq "T") && ($tab[$alt_value] eq "C")) || ( ($tab[$ref_value] eq "A") && ($tab[$alt_value] eq "G")) ) + { + my $mutation = "T:A>C:G"; + $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'TotalMutG'}++; # Count the total number of mutations + + # Pearson correlation + if(exists $refH_file->{$filename}{'SBSPerChr'}{$mutation}{'CHR'}{$chrNameForH}{'chr'}) { $refH_file->{$filename}{'SBSPerChr'}{$mutation}{'CHR'}{$chrNameForH}{'chr'}++; } + + # Sequence context - 6 mutation types - genomic strand + my $mutationSeqContext6mutType = "T>C"; + # We want to express the mutation in T> + if( ($tab[$ref_value] eq "A") && ($tab[$alt_value] eq "T") ) + { + my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]); + my $context_reverse = $base5."_".$base3; + if(exists $refH_file->{$filename}{'SeqContextG'}{$context_reverse}{$mutationSeqContext6mutType}) { $refH_file->{$filename}{'SeqContextG'}{$context_reverse}{$mutationSeqContext6mutType}++; } + } + elsif(exists 
$refH_file->{$filename}{'SeqContextG'}{$context}{$mutationSeqContext6mutType}) { $refH_file->{$filename}{'SeqContextG'}{$context}{$mutationSeqContext6mutType}++; } + + # Strand analysis T>C on NonTr strand + if( (($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "T")&&($tab[$alt_value] eq "C"))) || (($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "A")&&($tab[$alt_value] eq "G"))) ) + { + if(exists $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'NonTr'}) { $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'NonTr'}++; } + + # T>C (T>C strand = +) + if( ($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "T")&&($tab[$alt_value] eq "C")) ) + { + if(exists $refH_file->{$filename}{'SeqContextC'}{$context}{'T>C'}{'NonTr'}) { $refH_file->{$filename}{'SeqContextC'}{$context}{'T>C'}{'NonTr'}++; } + } + # T>C (A>G strand = -) + else + { + my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]); + my $context_reverse = $base5."_".$base3; + if(exists $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'T>C'}{'NonTr'}) { $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'T>C'}{'NonTr'}++; } + } + } + # Strand analysis T>C on Tr strand + if( (($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "T")&&($tab[$alt_value] eq "C"))) || (($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "A")&&($tab[$alt_value] eq "G"))) ) + { + if(exists $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'Tr'}) { $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'Tr'}++; } + + # T>C (T>C strand = -) + if( ($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "T")&&($tab[$alt_value] eq "C")) ) + { + if(exists $refH_file->{$filename}{'SeqContextC'}{$context}{'T>C'}{'Tr'}) { $refH_file->{$filename}{'SeqContextC'}{$context}{'T>C'}{'Tr'}++; } + } + # T>C (A>G strand = +) + else + { + my $base3 = complement($tempContextSequence[$before]); my $base5 = 
complement($tempContextSequence[$after]); + my $context_reverse = $base5."_".$base3; + if(exists $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'T>C'}{'Tr'}) { $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'T>C'}{'Tr'}++; } + } + } + # WebLogo-3 + if(($tab[$ref_value] eq "T") && ($tab[$alt_value] eq "C")) + { + if(scalar(@tempContextSequence) == 2) { next; } + my ($contextTemp1, $contextTemp2) = ("", ""); + for(my $i=0; $i<$midlle_totalNbBaseContext; $i++) { $contextTemp1 .= $tempContextSequence[$i]; } + for(my $i=$midlle_totalNbBaseContext+1; $i<=$#tempContextSequence; $i++) { $contextTemp2 .= $tempContextSequence[$i]; } + my $context = $contextTemp1."T".$contextTemp2; $context = reverse $context; + push(@{$refH_file->{$filename}{'WebLogo3'}{'TC'}}, $context); + } + else + { + if(scalar(@tempContextSequence) == 2) { next; } + my ($contextTemp1, $contextTemp2) = ("", ""); + for(my $i=0; $i<$midlle_totalNbBaseContext; $i++) { $contextTemp1 .= complement($tempContextSequence[$i]); } + for(my $i=$midlle_totalNbBaseContext+1; $i<=$#tempContextSequence; $i++) { $contextTemp2 .= complement($tempContextSequence[$i]); } + my $context = $contextTemp1."T".$contextTemp2; + push(@{$refH_file->{$filename}{'WebLogo3'}{'TC'}}, $context); + } + } + ###################################### T:A>G:C + if( (($tab[$ref_value] eq "T") && ($tab[$alt_value] eq "G")) || ( ($tab[$ref_value] eq "A") && ($tab[$alt_value] eq "C")) ) + { + my $mutation = "T:A>G:C"; + $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'TotalMutG'}++; # Count the total number of mutations + + # Pearson correlation + if(exists $refH_file->{$filename}{'SBSPerChr'}{$mutation}{'CHR'}{$chrNameForH}{'chr'}) { $refH_file->{$filename}{'SBSPerChr'}{$mutation}{'CHR'}{$chrNameForH}{'chr'}++; } + + # Sequence context - 6 mutation types - genomic strand + my $mutationSeqContext6mutType = "T>G"; + # We want to express the mutation in T> + if( ($tab[$ref_value] eq "A") && ($tab[$alt_value] eq "T") ) 
+ { + my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]); + my $context_reverse = $base5."_".$base3; + if(exists $refH_file->{$filename}{'SeqContextG'}{$context_reverse}{$mutationSeqContext6mutType}) { $refH_file->{$filename}{'SeqContextG'}{$context_reverse}{$mutationSeqContext6mutType}++; } + } + elsif(exists $refH_file->{$filename}{'SeqContextG'}{$context}{$mutationSeqContext6mutType}) { $refH_file->{$filename}{'SeqContextG'}{$context}{$mutationSeqContext6mutType}++; } + + # Strand analysis T>G on NonTr strand + if( (($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "T")&&($tab[$alt_value] eq "G"))) || (($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "A")&&($tab[$alt_value] eq "C"))) ) + { + if(exists $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'NonTr'}) { $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'NonTr'}++; } + + # T>G (T>G strand = +) + if( ($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "T")&&($tab[$alt_value] eq "G")) ) + { + if(exists $refH_file->{$filename}{'SeqContextC'}{$context}{'T>G'}{'NonTr'}) { $refH_file->{$filename}{'SeqContextC'}{$context}{'T>G'}{'NonTr'}++; } + } + # T>G (A>C strand = -) + else + { + my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]); + my $context_reverse = $base5."_".$base3; + if(exists $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'T>G'}{'NonTr'}) { $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'T>G'}{'NonTr'}++; } + } + } + # Strand analysis T>G on Tr strand + if( (($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "T")&&($tab[$alt_value] eq "G"))) || (($tab[$strand_value] eq "+") && (($tab[$ref_value] eq "A")&&($tab[$alt_value] eq "C"))) ) + { + if(exists $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'Tr'}) { $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'Tr'}++; } + + # T>G (T>G strand = -) + if( 
($tab[$strand_value] eq "-") && (($tab[$ref_value] eq "T")&&($tab[$alt_value] eq "G")) ) + { + if(exists $refH_file->{$filename}{'SeqContextC'}{$context}{'T>G'}{'Tr'}) { $refH_file->{$filename}{'SeqContextC'}{$context}{'T>G'}{'Tr'}++; } + } + # T>G (A>C strand = +) + else + { + my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]); + my $context_reverse = $base5."_".$base3; + if(exists $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'T>G'}{'Tr'}) { $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{'T>G'}{'Tr'}++; } + } + } + # WebLogo-3 + if(($tab[$ref_value] eq "T") && ($tab[$alt_value] eq "G")) + { + if(scalar(@tempContextSequence) == 2) { next; } + my ($contextTemp1, $contextTemp2) = ("", ""); + for(my $i=0; $i<$midlle_totalNbBaseContext; $i++) { $contextTemp1 .= $tempContextSequence[$i]; } + for(my $i=$midlle_totalNbBaseContext+1; $i<=$#tempContextSequence; $i++) { $contextTemp2 .= $tempContextSequence[$i]; } + my $context = $contextTemp1."T".$contextTemp2; $context = reverse $context; + push(@{$refH_file->{$filename}{'WebLogo3'}{'TG'}}, $context); + } + else + { + if(scalar(@tempContextSequence) == 2) { next; } + my ($contextTemp1, $contextTemp2) = ("", ""); + for(my $i=0; $i<$midlle_totalNbBaseContext; $i++) { $contextTemp1 .= complement($tempContextSequence[$i]); } + for(my $i=$midlle_totalNbBaseContext+1; $i<=$#tempContextSequence; $i++) { $contextTemp2 .= complement($tempContextSequence[$i]); } + my $context = $contextTemp1."T".$contextTemp2; + push(@{$refH_file->{$filename}{'WebLogo3'}{'TG'}}, $context); + } + } + } + close F1; + } + # Write the different statistics in the report + sub WriteStatistics + { + my ($refH_file, $nb_func, $folderFigure, $folderChi2, $folderNMF) = @_; + + # Save the different graphs in specific folder instead of in a general one. 
+ if(!-e "$folderFigure/Overall_mutation_distribution") { mkdir("$folderFigure/Overall_mutation_distribution") or die "Can't create the directory $folderFigure/Overall_mutation_distribution\n"; } + if(!-e "$folderFigure/Impact_protein_sequence") { mkdir("$folderFigure/Impact_protein_sequence") or die "Can't create the directory $folderFigure/Impact_protein_sequence\n"; } + if(!-e "$folderFigure/SBS_distribution") { mkdir("$folderFigure/SBS_distribution") or die "Can't create the directory $folderFigure/SBS_distribution\n"; } + if(!-e "$folderFigure/Stranded_Analysis") { mkdir("$folderFigure/Stranded_Analysis") or die "Can't create the directory $folderFigure/Stranded_Analysis\n"; } + if(!-e "$folderFigure/Trinucleotide_Sequence_Context") { mkdir("$folderFigure/Trinucleotide_Sequence_Context") or die "Can't create the directory $folderFigure/Trinucleotide_Sequence_Context\n"; } + if(!-e "$folderFigure/Distribution_SBS_Per_Chromosomes") { mkdir("$folderFigure/Distribution_SBS_Per_Chromosomes") or die "Can't create the directory $folderFigure/Distribution_SBS_Per_Chromosomes\n"; } + + + # Create a workbook with all the samples + my $wb = ""; my $ws_sum = ""; my %h_chi2 = (); + my ($ws_inputNMF_count, $ws_inputNMF_percent) = ("", ""); + ############### Define the format + my ($format_A10, $format_A10Boldleft, $format_A10ItalicRed) = ("", "", ""); + my ($formatT_left, $formatT_right, $formatT_bottomRight, $formatT_bottomLeft, $formatT_bottom, $formatT_bottomHeader, $formatT_bottomRightHeader, $formatT_bottomHeader2, $formatT_rightHeader); + my ($formatT_graphTitle); + my ($table_topleft, $table_topRight, $table_bottomleft, $table_bottomRight, $table_top, $table_right, $table_bottom, $table_bottomItalicRed, $table_left, $table_bottomrightHeader, $table_left2, $table_middleHeader, $table_middleHeader2); + + if($oneReportPerSample == 2) + { + $wb = Spreadsheet::WriteExcel->new("$output/Report_Mutation_Spectra.xls"); + + ############### Define the format + Format_A10($wb, 
\$format_A10); # Text center in Arial 10 + Format_A10BoldLeft($wb, \$format_A10Boldleft); # Text on the left in Arial 10 bold + Format_TextSection($wb, \$formatT_left, \$formatT_right, \$formatT_bottomRight, \$formatT_bottomLeft, \$formatT_bottom, \$formatT_bottomHeader, \$formatT_bottomRightHeader, \$formatT_bottomHeader2, \$formatT_rightHeader); + Format_GraphTitle($wb, \$formatT_graphTitle); + Format_Table($wb, \$table_topleft, \$table_topRight, \$table_bottomleft, \$table_bottomRight, \$table_top, \$table_right, \$table_bottom, \$table_bottomItalicRed, \$table_left, \$table_bottomrightHeader, \$table_left2, \$table_middleHeader, \$table_middleHeader2); + Format_A10ItalicRed($wb, \$format_A10ItalicRed); + + + ############### Worksheet with a summary of the samples + $ws_sum = $wb->add_worksheet("Sample_List"); + $ws_sum->write(0, 0, "Samples", $format_A10); $ws_sum->write(0, 1, "Total number SBS", $format_A10); $ws_sum->write(0, 2, "Total number of Indel", $format_A10); $ws_sum->write(0, 3, "Total number of mutations", $format_A10); + $ws_sum->set_column(0,0, 50); $ws_sum->set_column(1,1, 20); $ws_sum->set_column(2,2, 20); $ws_sum->set_column(3,3, 22); + + ############### Save the chi2 values into a hash table + if(-e "$folderChi2/Output_chi2_strandBias.txt") + { + open(F1, "$folderChi2/Output_chi2_strandBias.txt") or die "$!: $folderChi2/Output_chi2_strandBias.txt\n"; + my $header = <F1>; # Strand_Bias($tab[0]) NonTr-Tr($tab[1]) Proportion($tab[2]) P-val-Chi2($tab[3]) FDR($tab[4]) Confidence Interval($tab[5]) Mutation_Type($tab[6]) SampleName($tab[7]) + while(<F1>) + { + $_ =~ s/[\r\n]+$//; + my @tab = split("\t", $_); + + $h_chi2{$tab[7]}{$tab[6]}{'p-value'} = $tab[3]; $h_chi2{$tab[7]}{$tab[6]}{'ConfInt'} = $tab[5]; + + # For the pool data the FDR isn't calculated so replace the NA (=Missing values) by "-" + if($tab[7] eq "Pool_Data") { $h_chi2{$tab[7]}{$tab[6]}{'FDR'} = "-"; } + else { $h_chi2{$tab[7]}{$tab[6]}{'FDR'} = $tab[4]; } + } + close F1; + } + 
############### Write the input matrix for NMF for the count and the un-normalized frequency + $ws_inputNMF_count = $wb->add_worksheet("Input_NMF_Count"); + $ws_inputNMF_percent = $wb->add_worksheet("Input_NMF_Percent"); + } + + + ################################################ Set the Rows and columns of the different part of the report ################################################ + my $row_SumSheet = 1; # First row for the summary sheet of the report + my $rowStart_SBSdistrBySeg = 48; my $colStart_SBSdistrBySeg = 0; # For the table SBS distribution by segment + my $colStart_matrixSeqContext = 19; # Sequence context + my $col_inputNMF = 0; # Write the names of the samples with at least 33 SBS + + # For NMF input + my %h_inputNMF = (); + + ## For each file + foreach my $k_file (sort keys $refH_file) + { + print "File in process: $k_file\n"; + if($k_file ne "Pool_Data") { $col_inputNMF++; } + + # Create one workbook for each sample + if($oneReportPerSample == 1) + { + $wb = Spreadsheet::WriteExcel->new("$output/Report_Mutation_Spectra-$k_file.xls"); + + ############### Define the format + Format_A10($wb, \$format_A10); # Text center in Arial 10 + Format_A10BoldLeft($wb, \$format_A10Boldleft); # Text on the left in Arial 10 bold + Format_TextSection($wb, \$formatT_left, \$formatT_right, \$formatT_bottomRight, \$formatT_bottomLeft, \$formatT_bottom, \$formatT_bottomHeader, \$formatT_bottomRightHeader, \$formatT_bottomHeader2, \$formatT_rightHeader); + Format_GraphTitle($wb, \$formatT_graphTitle); + Format_Table($wb, \$table_topleft, \$table_topRight, \$table_bottomleft, \$table_bottomRight, \$table_top, \$table_right, \$table_bottom, \$table_bottomItalicRed, \$table_left, \$table_bottomrightHeader, \$table_left2, \$table_middleHeader, \$table_middleHeader2); + Format_A10ItalicRed($wb, \$format_A10ItalicRed); + + + ############### Worksheet with a summary of the samples + $ws_sum = $wb->add_worksheet("Sample_List"); + $ws_sum->write(0, 0, "Samples", $format_A10); 
$ws_sum->write(0, 1, "Total number SBS", $format_A10); $ws_sum->write(0, 2, "Total number of Indel", $format_A10); $ws_sum->write(0, 3, "Total number of mutations", $format_A10); + $ws_sum->set_column(0,0, 50); $ws_sum->set_column(1,1, 20); $ws_sum->set_column(2,2, 20); $ws_sum->set_column(3,3, 22); + # Write in the Samples sheet the name and the total number of SBS + $ws_sum->write(1, 0, "$k_file", $format_A10); + $ws_sum->write(1, 1, $refH_file->{$k_file}{'TotalSBSGenomic'}, $format_A10); $ws_sum->write(1, 2, $refH_file->{$k_file}{'TotalIndelGenomic'}, $format_A10); $ws_sum->write($row_SumSheet, 3, $refH_file->{$k_file}{'TotalMutGenomic'}, $format_A10); + + ############### Save the chi2 values into a hash table + if(-e "$folderChi2/Output_chi2_strandBias.txt") + { + open(F1, "$folderChi2/Output_chi2_strandBias.txt") or die "$!: $folderChi2/Output_chi2_strandBias.txt\n"; + my $header = <F1>; # Strand_Bias($tab[0]) NonTr-Tr($tab[1]) Proportion($tab[2]) P-val-Chi2($tab[3]) FDR($tab[4]) Confidence Interval($tab[5]) Mutation_Type($tab[6]) SampleName($tab[7]) + while(<F1>) + { + $_ =~ s/[\r\n]+$//; + my @tab = split("\t", $_); + + if($tab[7] eq $k_file) + { + $h_chi2{$tab[7]}{$tab[6]}{'p-value'} = $tab[3]; $h_chi2{$tab[7]}{$tab[6]}{'ConfInt'} = $tab[5]; + + # For the pool data the FDR isn't calculated so replace the NA (=Missing values) by "-" + if($tab[7] eq "Pool_Data") { $h_chi2{$tab[7]}{$tab[6]}{'FDR'} = "-"; } + else { $h_chi2{$tab[7]}{$tab[6]}{'FDR'} = $tab[4]; } + } + } + close F1; + } + + ############### Write the input matrix for NMF + if($k_file ne "Pool_Data") + { + # For NMF don't consider the pool of the samples + $ws_inputNMF_count = $wb->add_worksheet("Input_NMF_Count"); + $ws_inputNMF_percent = $wb->add_worksheet("Input_NMF_Percent"); + # Write in the input NMF sheet the name of the samples + $ws_inputNMF_count->write(0, 1, $k_file); + $ws_inputNMF_percent->write(0, 1, $k_file); + } + } + # One workbook with all the samples + else + { + # Write in the 
Samples sheet the name and the total number of SBS + $ws_sum->write($row_SumSheet, 0, $k_file, $format_A10); + $ws_sum->write($row_SumSheet, 1, $refH_file->{$k_file}{'TotalSBSGenomic'}, $format_A10); $ws_sum->write($row_SumSheet, 2, $refH_file->{$k_file}{'TotalIndelGenomic'}, $format_A10); $ws_sum->write($row_SumSheet, 3, $refH_file->{$k_file}{'TotalMutGenomic'}, $format_A10); + + # For NMF don't consider the pool of the samples + if($k_file ne "Pool_Data") + { + # Write in the input NMF sheet the name of the samples + $ws_inputNMF_count->write(0, $col_inputNMF, $k_file); + $ws_inputNMF_percent->write(0, $col_inputNMF, $k_file); + } + } + + # Count of SBS per chromosome + PearsonCoefficient($refH_file, $k_file); + + # Add a worksheet to the workbook + my $ws = $wb->add_worksheet($k_file); + + # Write the titles of the different sections of the report + WriteBoderSection($wb, $ws, $rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg, $nb_func, $colStart_matrixSeqContext); + + # Write the mutation types (6 types) + WriteHeaderSection($wb, $ws, $rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg, $nb_func, $colStart_matrixSeqContext); + + + # Save the figures of each samples in a different folder + if(!-e "$folderFigure/Overall_mutation_distribution/$k_file") { mkdir("$folderFigure/Overall_mutation_distribution/$k_file") or die "Can't create the directory $folderFigure/Overall_mutation_distribution/$k_file\n"; } + if(!-e "$folderFigure/Impact_protein_sequence/$k_file") { mkdir("$folderFigure/Impact_protein_sequence/$k_file") or die "Can't create the directory $folderFigure/Impact_protein_sequence/$k_file\n"; } + if(!-e "$folderFigure/SBS_distribution/$k_file") { mkdir("$folderFigure/SBS_distribution/$k_file") or die "Can't create the directory $folderFigure/SBS_distribution\n"; } + if(!-e "$folderFigure/Stranded_Analysis/$k_file") { mkdir("$folderFigure/Stranded_Analysis/$k_file") or die "Can't create the directory $folderFigure/Stranded_Analysis/$k_file\n"; } + if(!-e 
"$folderFigure/Trinucleotide_Sequence_Context/$k_file") { mkdir("$folderFigure/Trinucleotide_Sequence_Context/$k_file") or die "Can't create the directory $folderFigure/Trinucleotide_Sequence_Context/$k_file\n"; } + + + + ########################################################################################################################################################### + ################################################################# Write the statistics ################################################################### + ########################################################################################################################################################### + my ($ca_genomique, $cg_genomique, $ct_genomique, $ta_genomique, $tc_genomique, $tg_genomique) = (0,0,0,0,0,0); + my ($ca_NonTr, $ca_Tr, $cg_NonTr, $cg_Tr, $ct_NonTr, $ct_Tr, $ta_NonTr, $ta_Tr, $tc_NonTr, $tc_Tr, $tg_NonTr, $tg_Tr) = (0,0,0,0,0,0, 0,0,0,0,0,0); + + my $row_SBSdistrBySeg = $rowStart_SBSdistrBySeg+4; + my $row_SBSDistrBySegAndFunc_CA = $rowStart_SBSdistrBySeg+$nb_func+12; + my $row_SBSDistrBySegAndFunc_CG = $rowStart_SBSdistrBySeg+($nb_func*2)+16; my $rowEndCG_SBSDistrBySegAndFunc_CG = $row_SBSDistrBySegAndFunc_CG+$nb_func; + my $row_SBSDistrBySegAndFunc_CT = $rowStart_SBSdistrBySeg+($nb_func*3)+20; + + ## 6 mutation types by segment + foreach my $k_func (sort keys $refH_file->{$k_file}{'6mutType'}) + { + my $totalSBS_bySegment = 0; + + # Write the functional region for the section SBS distribution by segment + $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg, $k_func, $formatT_left); + + # Write the exonic func for the section strand bias by segment + $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg, $k_func, $formatT_left); + + if($row_SBSDistrBySegAndFunc_CG == $rowEndCG_SBSDistrBySegAndFunc_CG) + { + $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg, $k_func, $formatT_bottomLeft); + } + else + { + 
$ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg, $k_func, $formatT_left); + } + + foreach my $k_mutation (sort keys $refH_file->{$k_file}{'6mutType'}{$k_func}) + { + if($k_mutation eq "C:G>A:T") + { + # Write the ratio NonTr(CA)/Tr(GT) + my $ratioSB = 0; + if( ($refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'} == 0) || ($refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'} == 0) ) { $ratioSB = 0; } + else { $ratioSB = $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'} / $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}; } + $ratioSB = sprintf("%.2f", $ratioSB); + $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+1, $ratioSB, $format_A10); + + # Write the count of SBS in the NonTr and Tr strand + $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+2, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}, $format_A10); + $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+3, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}, $format_A10); + + # Calculate the total number of SBS per mut type (genomic strand) + $ca_genomique += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}; + # Calculate the total number of SBS by NonTr / Tr strand + $ca_NonTr += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}; $ca_Tr += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}; + + # Write the count by exonic region + $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+3, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}, $format_A10); + } + if($k_mutation eq "C:G>G:C") + { + # Write the ratio NonTr(CG)/Tr(GC) + my $ratioSB = 0; + if( ($refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'} == 0) || ($refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'} == 0) ) { $ratioSB = 0; } + else { $ratioSB = $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'} / 
$refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}; } + $ratioSB = sprintf("%.2f", $ratioSB); + $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+5, $ratioSB, $format_A10); + + # Write the count of SBS in the NonTr and Tr strand + $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+6, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}, $format_A10); + $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+7, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}, $format_A10); + + # Calculate the total number of SBS per mut type (genomic strand) + $cg_genomique += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}; + # Calculate the total number of SBS by NonTr / Tr strand + $cg_NonTr += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}; $cg_Tr += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}; + + # Write the count by exonic region + $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+5, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}, $format_A10); + } + if($k_mutation eq "C:G>T:A") + { + # Write the ratio NonTr(CT)/Tr(GA) + my $ratioSB = 0; + if( ($refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'} == 0) || ($refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'} == 0) ) { $ratioSB = 0; } + else { $ratioSB = $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'} / $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}; } + $ratioSB = sprintf("%.2f", $ratioSB); + $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+9, $ratioSB, $format_A10); + + # Write the count of SBS in the NonTr and Tr strand + $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+10, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}, $format_A10); + $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+11, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}, 
$formatT_right); + + # Calculate the total number of SBS per mut type (genomic strand) + $ct_genomique += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}; + # Calculate the total number of SBS by NonTr / Tr strand + $ct_NonTr += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}; $ct_Tr += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}; + + # Write the count by exonic region + $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+7, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}, $format_A10); + } + if($k_mutation eq "T:A>A:T") + { + # Write the ratio NonTr(AT)/Tr(TA) + my $ratioSB = 0; + if( ($refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'} == 0) || ($refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'} == 0) ) { $ratioSB = 0; } + else { $ratioSB = $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'} / $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}; } + $ratioSB = sprintf("%.2f", $ratioSB); + + + if($row_SBSDistrBySegAndFunc_CG == $rowEndCG_SBSDistrBySegAndFunc_CG) + { + # Write the ratio NonTr(AC)/Tr(TG) + $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+1, $ratioSB, $formatT_bottom); + # Write the count of SBS in the NonTr and Tr strand + $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+2, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}, $formatT_bottom); + $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+3, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}, $formatT_bottom); + } + else + { + # Write the ratio NonTr(AC)/Tr(TG) + $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+1, $ratioSB, $format_A10); + # Write the count of SBS in the NonTr and Tr strand + $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+2, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}, $format_A10); + $ws->write($row_SBSDistrBySegAndFunc_CG, 
$colStart_SBSdistrBySeg+3, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}, $format_A10); + } + + + # Calculate the total number of SBS per mut type (genomic strand) + $ta_genomique += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}; + # Calculate the total number of SBS by NonTr / Tr strand + $ta_NonTr += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}; $ta_Tr += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}; + + # Write the count by exonic region + $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+9, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}, $format_A10); + } + if($k_mutation eq "T:A>C:G") + { + # Write the ratio NonTr(AG)/Tr(TC) + my $ratioSB = 0; + if( ($refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'} == 0) || ($refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'} == 0) ) { $ratioSB = 0; } + else { $ratioSB = $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'} / $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}; } + $ratioSB = sprintf("%.2f", $ratioSB); + + if($row_SBSDistrBySegAndFunc_CG == $rowEndCG_SBSDistrBySegAndFunc_CG) + { + # Write the ratio NonTr(AC)/Tr(TG) + $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+5, $ratioSB, $formatT_bottom); + # Write the count of SBS in the NonTr and Tr strand + $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+6, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}, $formatT_bottom); + $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+7, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}, $formatT_bottom); + } + else + { + # Write the ratio NonTr(AC)/Tr(TG) + $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+5, $ratioSB, $format_A10); + # Write the count of SBS in the NonTr and Tr strand + $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+6, 
$refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}, $format_A10); + $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+7, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}, $format_A10); + } + + # Calculate the total number of SBS per mut type (genomic strand) + $tc_genomique += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}; + # Calculate the total number of SBS by NonTr / Tr strand + $tc_NonTr += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}; $tc_Tr += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}; + + # Write the count by exonic region + $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+11, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}, $format_A10); + } + if($k_mutation eq "T:A>G:C") + { + # Calculate the ratio for the strand bias + my $ratioSB = 0; + if( ($refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'} == 0) || ($refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'} == 0) ) { $ratioSB = 0; } + else { $ratioSB = $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'} / $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}; } + $ratioSB = sprintf("%.2f", $ratioSB); + + if($row_SBSDistrBySegAndFunc_CG == $rowEndCG_SBSDistrBySegAndFunc_CG) + { + # Write the ratio NonTr(AC)/Tr(TG) + $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+9, $ratioSB, $formatT_bottom); + # Write the count of SBS in the NonTr and Tr strand + $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+10, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}, $formatT_bottom); + $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+11, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}, $formatT_bottomRight); + } + else + { + # Write the ratio NonTr(AC)/Tr(TG) + $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+9, $ratioSB, $format_A10); + # Write the count of 
SBS in the NonTr and Tr strand + $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+10, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}, $format_A10); + $ws->write($row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg+11, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}, $formatT_right); + } + + # Calculate the total number of SBS per mut type (genomic strand) + $tg_genomique += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}; + # Calculate the total number of SBS by NonTr / Tr strand + $tg_NonTr += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}; $tg_Tr += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}; + + # Write the count by exonic region + $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+13, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}, $formatT_right); + } + + # Calculate the total number of SBS on the genomic strand for each mutation types by exonic region + $totalSBS_bySegment += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}; + } # End $k_mutation + + $row_SBSDistrBySegAndFunc_CA++; $row_SBSDistrBySegAndFunc_CG++; $row_SBSDistrBySegAndFunc_CT++; + + # Write the percent by exonic region + my $percent_ca = 0; + if($refH_file->{$k_file}{'6mutType'}{$k_func}{'C:G>A:T'}{'TotalMutG'} == 0) { $percent_ca = 0; } + else { $percent_ca = ($refH_file->{$k_file}{'6mutType'}{$k_func}{'C:G>A:T'}{'TotalMutG'} / $totalSBS_bySegment ) * 100; $percent_ca = sprintf("%.2f", $percent_ca); } + $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+2, $percent_ca, $format_A10); + my $percent_cg = 0; + if($refH_file->{$k_file}{'6mutType'}{$k_func}{'C:G>A:T'}{'TotalMutG'} == 0) { $percent_cg = 0; } + else { $percent_cg = ($refH_file->{$k_file}{'6mutType'}{$k_func}{'C:G>G:C'}{'TotalMutG'} / $totalSBS_bySegment ) * 100; $percent_cg = sprintf("%.2f", $percent_cg); } + $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+4, $percent_cg, 
$format_A10); + my $percent_ct = 0; + if($refH_file->{$k_file}{'6mutType'}{$k_func}{'C:G>A:T'}{'TotalMutG'} == 0) { $percent_ct = 0; } + else { $percent_ct = ($refH_file->{$k_file}{'6mutType'}{$k_func}{'C:G>T:A'}{'TotalMutG'} / $totalSBS_bySegment ) * 100; $percent_ct = sprintf("%.2f", $percent_ct); } + $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+6, $percent_ct, $format_A10); + my $percent_ta = 0; + if($refH_file->{$k_file}{'6mutType'}{$k_func}{'C:G>A:T'}{'TotalMutG'} == 0) { $percent_ta = 0; } + else { $percent_ta = ($refH_file->{$k_file}{'6mutType'}{$k_func}{'T:A>A:T'}{'TotalMutG'} / $totalSBS_bySegment ) * 100; $percent_ta = sprintf("%.2f", $percent_ta); } + $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+8, $percent_ta, $format_A10); + my $percent_tc = 0; + if($refH_file->{$k_file}{'6mutType'}{$k_func}{'C:G>A:T'}{'TotalMutG'} == 0) { $percent_tc = 0; } + else { $percent_tc = ($refH_file->{$k_file}{'6mutType'}{$k_func}{'T:A>C:G'}{'TotalMutG'} / $totalSBS_bySegment ) * 100; $percent_tc = sprintf("%.2f", $percent_tc); } + $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+10, $percent_tc, $format_A10); + my $percent_tg = 0; + if($refH_file->{$k_file}{'6mutType'}{$k_func}{'C:G>A:T'}{'TotalMutG'} == 0) { $percent_tg = 0; } + else { $percent_tg = ($refH_file->{$k_file}{'6mutType'}{$k_func}{'T:A>G:C'}{'TotalMutG'} / $totalSBS_bySegment ) * 100; $percent_tg = sprintf("%.2f", $percent_tg); } + $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+12, $percent_tg, $format_A10); + + # Write the count of SBS by segment + $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+1, $totalSBS_bySegment, $format_A10); + + $row_SBSdistrBySeg++; + } # End $k_func + + # Write the total number of SBS on the genomic strand + $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+1, $refH_file->{$k_file}{'TotalSBSGenomic'}, $formatT_bottomHeader); + + # Write the total and the percentage of SBS for each mutation types and save it to a text file + open(DISTRSBS, 
">", "$folderFigure/SBS_distribution/$k_file/$k_file-SBS_distribution.txt") or die "$!: $folderFigure/SBS_distribution/$k_file/$k_file-SBS_distribution.txt\n"; + print DISTRSBS "Mutation_Type\tCount\tPercentage\tSample\n"; + my $percent_ca = 0; + if($ca_genomique == 0) { $percent_ca = 0; } + else { $percent_ca = ($ca_genomique/$refH_file->{$k_file}{'TotalSBSGenomic'})*100; $percent_ca = sprintf("%.2f", $percent_ca); } + $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+2, $percent_ca, $formatT_bottom); print DISTRSBS "C:G>A:T\t$ca_genomique\t$percent_ca\t$k_file\n"; + $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+3, $ca_genomique, $formatT_bottomHeader); + my $percent_cg = 0; + if($cg_genomique == 0) { $percent_cg = 0; } + else { $percent_cg = ($cg_genomique/$refH_file->{$k_file}{'TotalSBSGenomic'})*100; $percent_cg = sprintf("%.2f", $percent_cg); } + $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+4, $percent_cg, $formatT_bottom); print DISTRSBS "C:G>G:C\t$cg_genomique\t$percent_cg\t$k_file\n"; + $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+5, $cg_genomique, $formatT_bottomHeader); + my $percent_ct = 0; + if($ct_genomique == 0) { $percent_ct = 0; } + else { $percent_ct = ($ct_genomique/$refH_file->{$k_file}{'TotalSBSGenomic'})*100; $percent_ct = sprintf("%.2f", $percent_ct); } + $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+6, $percent_ct, $formatT_bottom); print DISTRSBS "C:G>T:A\t$ct_genomique\t$percent_ct\t$k_file\n"; + $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+7, $ct_genomique, $formatT_bottomHeader); + my $percent_ta = 0; + if($ta_genomique == 0) { $percent_ta = 0; } + else { $percent_ta = ($ta_genomique/$refH_file->{$k_file}{'TotalSBSGenomic'})*100; $percent_ta = sprintf("%.2f", $percent_ta); } + $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+8, $percent_ta, $formatT_bottom); print DISTRSBS "T:A>A:T\t$ta_genomique\t$percent_ta\t$k_file\n"; + $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+9, 
$ta_genomique, $formatT_bottomHeader); + my $percent_tc = 0; + if($tc_genomique == 0) { $percent_tc = 0; } + else { $percent_tc = ($tc_genomique/$refH_file->{$k_file}{'TotalSBSGenomic'})*100; $percent_tc = sprintf("%.2f", $percent_tc); } + $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+10, $percent_tc, $formatT_bottom); print DISTRSBS "T:A>C:G\t$tc_genomique\t$percent_tc\t$k_file\n"; + $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+11, $tc_genomique, $formatT_bottomHeader); + my $percent_tg = 0; + if($tg_genomique == 0) { $percent_tg = 0; } + else { $percent_tg = ($tg_genomique/$refH_file->{$k_file}{'TotalSBSGenomic'})*100; $percent_tg = sprintf("%.2f", $percent_tg); } + $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+12, $percent_tg, $formatT_bottom); print DISTRSBS "T:A>G:C\t$tg_genomique\t$percent_tg\t$k_file\n"; + $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+13, $tg_genomique, $formatT_bottomRightHeader); + close DISTRSBS; + + ########################################################################################################################################################### + ################################################################### Write Strand BIAS ##################################################################### + ########################################################################################################################################################### + # Write the SB for each mutation type (table 3) + $ws->write(28, 11, "Table 3. 
Significance of the strand biases", $format_A10Boldleft);
$ws->set_column(11, 11, 13); $ws->set_column(16, 16, 15); $ws->set_column(17, 17, 10);

# Column titles shared by the two strand-bias tables (header rows 29 and 40, columns 11 to 17)
my @sbColumnTitles = (
    ["Mutation Type", $table_topleft],
    ["Non-Tr/Tr",     $table_top],
    ["Non-Tr",        $table_top],
    ["Tr",            $table_top],
    ["P-value",       $table_top],
    ["FDR q value",   $table_top],
    ["95% CI",        $table_topRight],
);
my $writeSBColumnTitles = sub
{
    my ($headerRow) = @_;
    foreach my $i (0 .. $#sbColumnTitles)
    {
        $ws->write($headerRow, 11+$i, $sbColumnTitles[$i][0], $sbColumnTitles[$i][1]);
    }
};
$writeSBColumnTitles->(29);

# Second table (rows 40 to 46): same statistics, labelled with the complementary mutation types
$ws->write(39, 11, "Table 3. Significance of the strand biases", $format_A10Boldleft);
$writeSBColumnTitles->(40);

# For ggplot2
open(SB, ">", "$folderFigure/Stranded_Analysis/$k_file/$k_file-StrandBias.txt") or die "$!: $folderFigure/Stranded_Analysis/$k_file/$k_file-StrandBias.txt\n";
print SB "Alteration\tStrand\tCount\n";

#-----------------------------------------------------------------------------------------------------#
# Ratios (Non-Tr/Tr and its inverse) and percentages of the total SBS, one set per mutation type.
# They stay separate lexicals because the percentages are reset by name further down in the file.
my ($ratio_ca, $ratio_gt, $percent_ca_NonTr, $percent_ca_Tr) = (0, 0, 0, 0);
my ($ratio_cg, $ratio_gc, $percent_cg_NonTr, $percent_cg_Tr) = (0, 0, 0, 0);
my ($ratio_ct, $ratio_ga, $percent_ct_NonTr, $percent_ct_Tr) = (0, 0, 0, 0);
my ($ratio_ta, $ratio_at, $percent_ta_NonTr, $percent_ta_Tr) = (0, 0, 0, 0);
my ($ratio_tc, $ratio_ag, $percent_tc_NonTr, $percent_tc_Tr) = (0, 0, 0, 0);
my ($ratio_tg, $ratio_ac, $percent_tg_NonTr, $percent_tg_Tr) = (0, 0, 0, 0);

# One entry per mutation type, in worksheet order (rows 30 to 35, complementary rows 41 to 46)
my @strandBias = (
    { mut => 'C>A', comp => 'G>T', nonTr => $ca_NonTr, tr => $ca_Tr, slots => [\$ratio_ca, \$ratio_gt, \$percent_ca_NonTr, \$percent_ca_Tr] },
    { mut => 'C>G', comp => 'G>C', nonTr => $cg_NonTr, tr => $cg_Tr, slots => [\$ratio_cg, \$ratio_gc, \$percent_cg_NonTr, \$percent_cg_Tr] },
    { mut => 'C>T', comp => 'G>A', nonTr => $ct_NonTr, tr => $ct_Tr, slots => [\$ratio_ct, \$ratio_ga, \$percent_ct_NonTr, \$percent_ct_Tr] },
    { mut => 'T>A', comp => 'A>T', nonTr => $ta_NonTr, tr => $ta_Tr, slots => [\$ratio_ta, \$ratio_at, \$percent_ta_NonTr, \$percent_ta_Tr] },
    { mut => 'T>C', comp => 'A>G', nonTr => $tc_NonTr, tr => $tc_Tr, slots => [\$ratio_tc, \$ratio_ag, \$percent_tc_NonTr, \$percent_tc_Tr] },
    { mut => 'T>G', comp => 'A>C', nonTr => $tg_NonTr, tr => $tg_Tr, slots => [\$ratio_tg, \$ratio_ac, \$percent_tg_NonTr, \$percent_tg_Tr] },
);

foreach my $i (0 .. $#strandBias)
{
    my $sb = $strandBias[$i];
    my ($mut, $comp, $nonTr, $tr) = @{$sb}{qw(mut comp nonTr tr)};
    my ($r_ratio, $r_ratioComp, $r_percentNonTr, $r_percentTr) = @{$sb->{slots}};

    # Everything stays at 0 as soon as one of the two strands has no mutation
    if( ($nonTr != 0) && ($tr != 0) )
    {
        $$r_ratio        = sprintf("%.2f", $nonTr/$tr);
        $$r_ratioComp    = sprintf("%.2f", $tr/$nonTr);
        $$r_percentNonTr = ($nonTr/$refH_file->{$k_file}{'TotalSBSGenomic'})*100;
        $$r_percentTr    = ($tr/$refH_file->{$k_file}{'TotalSBSGenomic'})*100;
    }
    print SB "$mut\tNonTranscribed\t$nonTr\n", "$mut\tTranscribed\t$tr\n";

    # The last mutation type closes both tables and uses the "bottom" variants of the formats
    my ($fmtLeft, $fmtRatio, $fmtCell, $fmtWarning, $fmtRight) = ($i == $#strandBias)
        ? ($table_bottomleft, $table_middleHeader2, $table_bottom, $table_bottomItalicRed, $table_bottomRight)
        : ($table_left,       $table_middleHeader,  $format_A10,   $format_A10ItalicRed,   $table_right);

    # Write in italic and red (= warning message) when the count of NonTr + Tr is lower than 10,
    # except when there is no p-value at all ("NA")
    my $pValue    = $h_chi2{$k_file}{$mut}{'p-value'};
    my $fmtPValue = ( (($nonTr+$tr) < 10) && ($pValue ne "NA") ) ? $fmtWarning : $fmtCell;

    # First the row of the mutation type itself, then the row of its complement: the complement
    # swaps the two strand counts and shows the inverse ratio, the test results are the same
    my $row = 30 + $i;
    foreach my $entry ( [$row, $mut, $$r_ratio, $nonTr, $tr], [$row+11, $comp, $$r_ratioComp, $tr, $nonTr] )
    {
        my ($r, $label, $ratio, $col13, $col14) = @{$entry};
        $ws->write($r, 11, $label, $fmtLeft); $ws->write($r, 12, $ratio, $fmtRatio); $ws->write($r, 13, $col13, $fmtCell); $ws->write($r, 14, $col14, $fmtCell);
        $ws->write_string($r, 15, $pValue, $fmtPValue);
        $ws->write($r, 16, $h_chi2{$k_file}{$mut}{'FDR'}, $fmtCell); $ws->write($r, 17, $h_chi2{$k_file}{$mut}{'ConfInt'}, $fmtRight);
    }
}

### Write a warning message when NonTr+Tr < 10
my $format_italic_red = $wb->add_format(font=>'Arial', size=>10, italic=>1, color => 'red');

if( grep { ($_->{nonTr} + $_->{tr}) < 10 } @strandBias )
{
    $ws->write(36, 11, "Warning message: chi-squared approximation may be incorrect because the number of SBS", $format_italic_red);
    $ws->write(37, 11, "on Non-transcribed and transcribed strand is lower than 10", $format_italic_red);
}

close SB;


###########################################################################################################################################################
################################################################### Write SBS Per Chr #####################################################################
###########################################################################################################################################################
# For the HTML report
open(SBSPerChr, ">", "$folderFigure/Distribution_SBS_Per_Chromosomes/$k_file-DistributionSNVS_per_chromosome.txt") or die "$!: $folderFigure/Distribution_SBS_Per_Chromosomes/$k_file-DistributionSNVS_per_chromosome.txt\n";
print SBSPerChr "\tPearson\t$refH_file->{$k_file}{'SBSPerChr'}{'AllMutType'}\t", $refH_file->{$k_file}{'SBSPerChr'}{"C:G>A:T"}{'Pearson'},"\t", $refH_file->{$k_file}{'SBSPerChr'}{"C:G>G:C"}{'Pearson'},"\t", $refH_file->{$k_file}{'SBSPerChr'}{"C:G>T:A"}{'Pearson'},"\t", $refH_file->{$k_file}{'SBSPerChr'}{"T:A>A:T"}{'Pearson'},"\t", $refH_file->{$k_file}{'SBSPerChr'}{"T:A>C:G"}{'Pearson'},"\t", $refH_file->{$k_file}{'SBSPerChr'}{"T:A>G:C"}{'Pearson'},"\n";
print SBSPerChr "Chr\tSize\tAll SBS\tC:G>A:T\tC:G>G:C\tC:G>T:A\tT:A>A:T\tT:A>C:G\tT:A>G:C\n";

my $row_SBSPerChr = $row_SBSDistrBySegAndFunc_CG + 8; # Line 158

# Write the Pearson coefficient
$ws->write($row_SBSDistrBySegAndFunc_CG+6, 
$colStart_SBSdistrBySeg+3, $refH_file->{$k_file}{'SBSPerChr'}{"C:G>A:T"}{'Pearson'}, $format_A10);

# The six mutation types, in the order of their worksheet columns (+3 to +8)
my @sbsTypes = ("C:G>A:T", "C:G>G:C", "C:G>T:A", "T:A>A:T", "T:A>C:G", "T:A>G:C");

# Pearson coefficients of the five remaining types; the last column closes the table on the right
foreach my $i (1 .. $#sbsTypes)
{
    my $fmtPearson = ($i == $#sbsTypes) ? $formatT_right : $format_A10;
    $ws->write($row_SBSDistrBySegAndFunc_CG+6, $colStart_SBSdistrBySeg+3+$i, $refH_file->{$k_file}{'SBSPerChr'}{$sbsTypes[$i]}{'Pearson'}, $fmtPearson);
}

# One worksheet row and one line of the text file per chromosome:
# name, size, total number of SBS, then the count of each mutation type
my $line=0;

foreach my $chromosome (sort keys %chromosomes)
{
    my $rowChr        = $row_SBSPerChr+($line);
    my $totalChr      = $refH_file->{$k_file}{'SBSPerChr'}{'TotalPerChr'}{$chromosome}{'chr'};
    my @countsPerType = map { $refH_file->{$k_file}{'SBSPerChr'}{$_}{'CHR'}{$chromosome}{'chr'} } @sbsTypes;

    $ws->write($rowChr, $colStart_SBSdistrBySeg, $chromosome, $formatT_left);
    $ws->write($rowChr, $colStart_SBSdistrBySeg+1, $chromosomes{$chromosome}, $format_A10);
    $ws->write($rowChr, $colStart_SBSdistrBySeg+2, $totalChr, $format_A10);

    # Write the count per mutation
    foreach my $i (0 .. $#sbsTypes)
    {
        my $fmtCount = ($i == $#sbsTypes) ? $formatT_right : $format_A10;
        $ws->write($rowChr, $colStart_SBSdistrBySeg+3+$i, $countsPerType[$i], $fmtCount);
    }

    # For the HTML report
    print SBSPerChr "$chromosome\t", $chromosomes{$chromosome},"\t", $totalChr, (map { ("\t", $_) } @countsPerType), "\n";
    $line++;
}

# Write the Pearson coefficient for the total number of SBS
$ws->write($row_SBSDistrBySegAndFunc_CG+6, $colStart_SBSdistrBySeg+2, $refH_file->{$k_file}{'SBSPerChr'}{'AllMutType'}, $format_A10);
# Grand total of SBS, on the row that follows the last chromosome
$ws->write($row_SBSPerChr + scalar(keys %chromosomes), $colStart_SBSdistrBySeg+2, $refH_file->{$k_file}{'TotalSBSGenomic'}, $formatT_bottomHeader);

print SBSPerChr "\t\t$refH_file->{$k_file}{'TotalSBSGenomic'}\n";
close SBSPerChr;



###########################################################################################################################################################
####################################################################### Impact on protein #################################################################
###########################################################################################################################################################
$ws->write(29, 6, "Table 2. 
Frequency and counts of functional impact", $format_A10Boldleft);
$ws->set_column(6, 6, 13); $ws->set_column(10, 10, 15);
$ws->write(30, 6, "RefSeq gene", $table_topleft); $ws->write(30, 7, "", $table_top); $ws->write(30, 8, "Percent", $table_top); $ws->write(30, 9, "Count", $table_topRight);
my $lImpactSBS = 31;
open(IMPACTSBS, ">", "$folderFigure/Impact_protein_sequence/$k_file/$k_file-DistributionExoFunc.txt") or die "$!: $folderFigure/Impact_protein_sequence/$k_file/$k_file-DistributionExoFunc.txt\n";
# NOTE(review): this header announces Count before Percent, but every row written below puts the
# percentage first and the count second - confirm what the consumer of this file expects before
# reordering either of them.
print IMPACTSBS "AA_Change\tCount\tPercent\n";

# Pie chart with the distribution of SBS vs Indel
open(SBSINDEL, ">", "$folderFigure/Overall_mutation_distribution/$k_file/$k_file-OverallMutationDistribution.txt") or die "$!: $folderFigure/Overall_mutation_distribution/$k_file/$k_file-OverallMutationDistribution.txt\n";
print SBSINDEL "Variant_type\tCount\tPercent\n";
my ($deletion, $insertion) = (0, 0);

# Share of a count among all the mutations of the sample, as a "%.2f" string.
# A sample without any mutation yields 0.00 instead of dying on a division by zero.
my $percentOfMut = sub
{
    my ($n) = @_;
    my $totalMut = $refH_file->{$k_file}{'TotalMutGenomic'};
    return sprintf("%.2f", $totalMut ? ($n/$totalMut)*100 : 0);
};

# keys() is given a real hash (%{...}): passing the reference itself is a fatal error since Perl 5.24
foreach my $k_exoFunc (sort keys %{$refH_file->{$k_file}{'ImpactSBS'}})
{
    my $count   = $refH_file->{$k_file}{'ImpactSBS'}{$k_exoFunc};
    my $percent = $percentOfMut->($count);

    # Label used in the text file: no space, and "NA" spelled out
    (my $label = $k_exoFunc) =~ s/ /_/g;
    if($k_exoFunc eq "NA") { $label = "Not_Applicable"; }
    print IMPACTSBS "$label\t$percent\t$count\n";

    $ws->write($lImpactSBS, 6, $k_exoFunc, $table_left2); $ws->write($lImpactSBS, 8, $percent, $format_A10); $ws->write($lImpactSBS, 9, $count, $table_right);
    $lImpactSBS++;

    # Pie chart with the distribution of SBS vs Indel
    if($k_exoFunc =~ /deletion/i)     { $deletion  += $count; }
    elsif($k_exoFunc =~ /insertion/i) { $insertion += $count; }
}
close IMPACTSBS;

# Closing row of the table: total number of mutations under the Count column
$ws->write($lImpactSBS, 9, $refH_file->{$k_file}{'TotalMutGenomic'}, $table_bottomrightHeader);
$ws->write($lImpactSBS, 6, "", $table_bottomleft); $ws->write($lImpactSBS, 7, "", $table_bottom); $ws->write($lImpactSBS, 8, "", $table_bottom);

# Pie chart with the distribution of SBS vs Indel
my $percentSBSIndel = $percentOfMut->($deletion);
print SBSINDEL "Deletion\t$deletion\t$percentSBSIndel\n";
$percentSBSIndel = $percentOfMut->($insertion);
print SBSINDEL "Insertion\t$insertion\t$percentSBSIndel\n";
$percentSBSIndel = $percentOfMut->($refH_file->{$k_file}{TotalSBSGenomic});
print SBSINDEL "SBS\t$refH_file->{$k_file}{TotalSBSGenomic}\t$percentSBSIndel\n";
close SBSINDEL;

###########################################################################################################################################################
######################################################## SEQUENCE CONTEXT ON GENOMIC STRAND ###############################################################
###########################################################################################################################################################
my $row_SeqContext6 = 4;
# Count the total of mutations for 6 mutation types on genomic strand
my ($c_ca6_g, $c_cg6_g, $c_ct6_g, $c_ta6_g, $c_tc6_g, $c_tg6_g) = (0,0,0, 0,0,0);
my ($p_ca6_g, $p_cg6_g, $p_ct6_g, $p_ta6_g, $p_tc6_g, $p_tg6_g) = (0,0,0, 0,0,0);
my $maxValue = 0; # For the heatmap

# For checking if the total number of SBS is correct
my $total_SBS_genomic = 0;


open(HEATMAPCGENOMIC, ">", "$folderFigure/Trinucleotide_Sequence_Context/$k_file/$k_file-HeatmapCount-Genomic.txt") or die "$!: $folderFigure/Trinucleotide_Sequence_Context/$k_file/$k_file-HeatmapCount-Genomic.txt\n";
print HEATMAPCGENOMIC "\tC>A\tC>G\tC>T\tT>A\tT>C\tT>G\n";
open(HEATMAPPGENOMIC, ">", "$folderFigure/Trinucleotide_Sequence_Context/$k_file/$k_file-HeatmapPercent-Genomic.txt") or die "$!: $folderFigure/Trinucleotide_Sequence_Context/$k_file/$k_file-HeatmapPercent-Genomic.txt\n";
print HEATMAPPGENOMIC "\tC>A\tC>G\tC>T\tT>A\tT>C\tT>G\n";

## Bar plot NMF like
open(BARPLOTNMFLIKE, ">", "$folderFigure/Trinucleotide_Sequence_Context/$k_file/$k_file-MutationSpectraPercent-Genomic.txt") or die "$!: $folderFigure/Trinucleotide_Sequence_Context/$k_file/$k_file-MutationSpectraPercent-Genomic.txt\n";
print BARPLOTNMFLIKE "alteration\tcontext\tvalue\n";

# For each mutation type: offset of its count column in the matrix (the percentage column is
# 10 columns further) and the two running totals it feeds
my @genomicOrder = ('C>A', 'C>G', 'C>T', 'T>A', 'T>C', 'T>G');
my %genomicSlot = (
    'C>A' => { offset => 4, count => \$c_ca6_g, percent => \$p_ca6_g },
    'C>G' => { offset => 5, count => \$c_cg6_g, percent => \$p_cg6_g },
    'C>T' => { offset => 6, count => \$c_ct6_g, percent => \$p_ct6_g },
    'T>A' => { offset => 7, count => \$c_ta6_g, percent => \$p_ta6_g },
    'T>C' => { offset => 8, count => \$c_tc6_g, percent => \$p_tc6_g },
    'T>G' => { offset => 9, count => \$c_tg6_g, percent => \$p_tg6_g },
);

# keys() is given real hashes (%{...}): passing a reference is a fatal error since Perl 5.24
foreach my $k_context (sort keys %{$refH_file->{$k_file}{'SeqContextG'}})
{
    # Only plain trinucleotide contexts are reported
    if( ($k_context =~ /N/) || (length($k_context) != 3) ) { next; }

    # Write the context: 6 mut type on genomic strand
    $ws->write($row_SeqContext6 , $colStart_matrixSeqContext+3, $k_context, $format_A10); $ws->write($row_SeqContext6 , $colStart_matrixSeqContext+13, $k_context, $format_A10);

    foreach my $k_mutation (sort keys %{$refH_file->{$k_file}{'SeqContextG'}{$k_context}})
    {
        my $count = $refH_file->{$k_file}{'SeqContextG'}{$k_context}{$k_mutation};

        # For checking the total number of SBS
        $total_SBS_genomic += $count;

        # Calculate the percentages
        my $percent = 0;
        if($count != 0)
        {
            $percent = sprintf("%.2f", ($count / $refH_file->{$k_file}{'TotalSBSGenomic'}) * 100);
        }

        # For representing the sequence context with a bar plot (NMF like style)
        print BARPLOTNMFLIKE $k_mutation,"\t", $k_context,"\t", $percent,"\n";

        # Any key other than the six mutation types is only counted and listed in the bar plot file
        my $slot = $genomicSlot{$k_mutation};
        next unless defined $slot;

        # A heatmap line starts with the context (written along with C>A, first in sort order)
        # and ends with T>G (last in sort order)
        my $lineStart = ($k_mutation eq 'C>A') ? "$k_context\t" : "";
        my $lineEnd   = ($k_mutation eq 'T>G') ? "\n" : "\t";

        ### COUNT
        $ws->write($row_SeqContext6, $colStart_matrixSeqContext+$slot->{offset}, $count, $format_A10);
        print HEATMAPCGENOMIC "$lineStart$count$lineEnd";

        ### PERCENTAGE
        $ws->write($row_SeqContext6, $colStart_matrixSeqContext+$slot->{offset}+10, $percent, $format_A10);
        print HEATMAPPGENOMIC "$lineStart$percent$lineEnd";

        # For NMF input
        if($k_file ne "Pool_Data")
        {
            push(@{$h_inputNMF{'Count'}{$k_context}{$k_mutation}}, $count);
            push(@{$h_inputNMF{'Percent'}{$k_context}{$k_mutation}}, $percent);
        }

        # For the heatmap
        if($percent >= $maxValue) { $maxValue = $percent; }

        # For the total amount per mutation types
        ${$slot->{count}}   += $count;
        ${$slot->{percent}} += $percent;
    }
    $row_SeqContext6++;
}
close HEATMAPCGENOMIC; close HEATMAPPGENOMIC;
close BARPLOTNMFLIKE;


# Write the total number of SBS per mutation type: COUNT
foreach my $mut (@genomicOrder)
{
    $ws->write($row_SeqContext6, $colStart_matrixSeqContext+$genomicSlot{$mut}{offset}, ${$genomicSlot{$mut}{count}}, $formatT_bottomHeader2);
}
# Exit with a non-zero status so that the calling process can detect the failure
if($total_SBS_genomic != $refH_file->{$k_file}{'TotalSBSGenomic'}) { print STDERR "Error in the calculation of the total number of SBS on the genomic strand!!!!\nFrom hash table $refH_file->{$k_file}{'TotalSBSGenomic'}\tVS\t$total_SBS_genomic\n"; exit 1; }

# Write the total number of SBS per mutation type: PERCENT
foreach my $mut (@genomicOrder)
{
    $ws->write($row_SeqContext6, $colStart_matrixSeqContext+$genomicSlot{$mut}{offset}+10, ${$genomicSlot{$mut}{percent}}, $formatT_bottomHeader2);
}
# The percentages were rounded one by one, so their sum is compared to 100 after rounding to an integer
my $totalPercent_genomic = $p_ca6_g + $p_cg6_g + $p_ct6_g + $p_ta6_g + $p_tc6_g + $p_tg6_g; $totalPercent_genomic = sprintf("%.0f", $totalPercent_genomic);
if($totalPercent_genomic != 100) { print STDERR "Error in the calculation of the total percentages on the genomic strand!!!\nThe total is equal to=\t$totalPercent_genomic\n"; exit 1; }


#----------------------------------------------------------------------------------------------------------------------------------------------------------------#
# For the input matrix for NMF
if($k_file ne "Pool_Data") { push(@{$h_inputNMF{'Sample'}}, $k_file); }


########################################################################################################################################################### + ######################################################## SEQUENCE CONTEXT ON CODING STRAND ############################################################### + ########################################################################################################################################################### + my $row_SeqContext12 = $rowStart_SBSdistrBySeg+6; my $row_SeqContext12Percent = $rowStart_SBSdistrBySeg+27; + # Reset the total count and percent calculated for the strand bias + ($ca_NonTr, $ca_Tr, $cg_NonTr, $cg_Tr, $ct_NonTr, $ct_Tr, $ta_NonTr, $ta_Tr, $tc_NonTr, $tc_Tr, $tg_NonTr, $tg_Tr) = (0,0,0, 0,0,0, 0,0,0, 0,0,0); + ($percent_ca_NonTr, $percent_ca_Tr, $percent_cg_NonTr, $percent_cg_Tr, $percent_ct_NonTr, $percent_ct_Tr, $percent_ta_NonTr, $percent_ta_Tr, $percent_tc_NonTr, $percent_tc_Tr, $percent_tg_NonTr, $percent_tg_Tr) = (0,0,0, 0,0,0, 0,0,0, 0,0,0); + + # For checking if the total number of SBS is correct + my $total_SBS_coding = 0; + + open(COUNT, ">", "$folderFigure/Stranded_Analysis/$k_file/$k_file-StrandedSignatureCount.txt") or die "$!: $folderFigure/Stranded_Analysis/$k_file/$k_file-StrandedSignatureCount.txt\n"; + print COUNT "MutationTypeContext\tStrand\tValue\tSample\n"; + open(PERCENT, ">", "$folderFigure/Stranded_Analysis/$k_file/$k_file-StrandedSignaturePercent.txt") or die "$!: $folderFigure/Stranded_Analysis/$k_file/$k_file-StrandedSignaturePercent.txt\n"; + print PERCENT "MutationTypeContext\tStrand\tValue\tSample\n"; + + foreach my $k_context (sort keys $refH_file->{$k_file}{'SeqContextC'}) + { + if( ($k_context =~ /N/) || (length($k_context) != 3) ) { next; } + + # Write the context: 12 mut type on coding strand + $ws->write($row_SeqContext12 , $colStart_matrixSeqContext, $k_context, $formatT_left); $ws->write($row_SeqContext12Percent , $colStart_matrixSeqContext, 
$k_context, $formatT_left); + + foreach my $k_mutation (sort keys $refH_file->{$k_file}{'SeqContextC'}{$k_context}) + { + # Percent: 12 mut type on coding strand + my ($percent_NonTr, $percent_Tr) = (0, 0); + if($refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'} == 0) { $percent_NonTr = 0; } + else { $percent_NonTr = ( $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'} / $refH_file->{$k_file}{'TotalSBSCoding'} ) * 100 } + if($refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'} == 0) { $percent_Tr = 0; } + else { $percent_Tr = ( $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'} / $refH_file->{$k_file}{'TotalSBSCoding'} ) * 100 } + + + # Calculate the total number for each mutation types + if($k_mutation eq "C>A") + { + $ca_NonTr += $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}; + $ca_Tr += $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}; + + # COUNT : 12 mutation type (stranded bar graph) + $ws->write($row_SeqContext12, $colStart_matrixSeqContext+1, $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}, $format_A10); + print COUNT "$k_mutation:$k_context\tNonTranscribed\t$refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}\t$k_file\n"; + $ws->write($row_SeqContext12, $colStart_matrixSeqContext+2, $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}, $format_A10); + print COUNT "$k_mutation:$k_context\tTranscribed\t$refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}\t$k_file\n"; + + ## PERCENT : 12 mutation type (stranded bar graph) + my $percent_NonTr = ($refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}/$refH_file->{$k_file}{'TotalSBSCoding'})*100; + $percent_NonTr = sprintf("%.2f", $percent_NonTr); $percent_ca_NonTr += $percent_NonTr; + my $percent_Tr = 
($refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}/$refH_file->{$k_file}{'TotalSBSCoding'})*100; + $percent_Tr = sprintf("%.2f", $percent_Tr); $percent_ca_Tr += $percent_Tr; + print PERCENT "$k_mutation:$k_context\tNonTranscribed\t$percent_NonTr\t$k_file\n"; + print PERCENT "$k_mutation:$k_context\tTranscribed\t$percent_Tr\t$k_file\n"; + + $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+1, $percent_NonTr, $format_A10); + $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+2, $percent_Tr, $format_A10); + } + if($k_mutation eq "C>G") + { + $cg_NonTr += $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}; + $cg_Tr += $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}; + + # COUNT : 12 mutation type (stranded bar graph) + $ws->write($row_SeqContext12, $colStart_matrixSeqContext+3, $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}, $format_A10); + print COUNT "$k_mutation:$k_context\tNonTranscribed\t$refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}\t$k_file\n"; + $ws->write($row_SeqContext12, $colStart_matrixSeqContext+4, $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}, $format_A10); + print COUNT "$k_mutation:$k_context\tTranscribed\t$refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}\t$k_file\n"; + + ## PERCENT : 12 mutation type (stranded bar graph) + my $percent_NonTr = ($refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}/$refH_file->{$k_file}{'TotalSBSCoding'})*100; + $percent_NonTr = sprintf("%.2f", $percent_NonTr); $percent_cg_NonTr += $percent_NonTr; + my $percent_Tr = ($refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}/$refH_file->{$k_file}{'TotalSBSCoding'})*100; + $percent_Tr = sprintf("%.2f", $percent_Tr); $percent_cg_Tr += $percent_Tr; + print PERCENT "$k_mutation:$k_context\tNonTranscribed\t$percent_NonTr\t$k_file\n"; + print PERCENT 
"$k_mutation:$k_context\tTranscribed\t$percent_Tr\t$k_file\n"; + + $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+3, $percent_NonTr, $format_A10); + $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+4, $percent_Tr, $format_A10); + } + if($k_mutation eq "C>T") + { + $ct_NonTr += $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}; + $ct_Tr += $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}; + + # COUNT : 12 mutation type (stranded bar graph) + $ws->write($row_SeqContext12, $colStart_matrixSeqContext+5, $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}, $format_A10); + print COUNT "$k_mutation:$k_context\tNonTranscribed\t$refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}\t$k_file\n"; + $ws->write($row_SeqContext12, $colStart_matrixSeqContext+6, $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}, $format_A10); + print COUNT "$k_mutation:$k_context\tTranscribed\t$refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}\t$k_file\n"; + + ## PERCENT : 12 mutation type (stranded bar graph) + my $percent_NonTr = ($refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}/$refH_file->{$k_file}{'TotalSBSCoding'})*100; + $percent_NonTr = sprintf("%.2f", $percent_NonTr); $percent_ct_NonTr += $percent_NonTr; + my $percent_Tr = ($refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}/$refH_file->{$k_file}{'TotalSBSCoding'})*100; + $percent_Tr = sprintf("%.2f", $percent_Tr); $percent_ct_Tr += $percent_Tr; + print PERCENT "$k_mutation:$k_context\tNonTranscribed\t$percent_NonTr\t$k_file\n"; + print PERCENT "$k_mutation:$k_context\tTranscribed\t$percent_Tr\t$k_file\n"; + + $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+5, $percent_NonTr, $format_A10); + $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+6, $percent_Tr, $format_A10); + } + if($k_mutation eq "T>A") + { + $ta_NonTr += 
$refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}; + $ta_Tr += $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}; + + # COUNT : 12 mutation type (stranded bar graph) + $ws->write($row_SeqContext12, $colStart_matrixSeqContext+7, $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}, $format_A10); + print COUNT "$k_mutation:$k_context\tNonTranscribed\t$refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}\t$k_file\n"; + $ws->write($row_SeqContext12, $colStart_matrixSeqContext+8, $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}, $format_A10); + print COUNT "$k_mutation:$k_context\tTranscribed\t$refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}\t$k_file\n"; + + ## PERCENT : 12 mutation type (stranded bar graph) + my $percent_NonTr = ($refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}/$refH_file->{$k_file}{'TotalSBSCoding'})*100; + $percent_NonTr = sprintf("%.2f", $percent_NonTr); $percent_ta_NonTr += $percent_NonTr; + my $percent_Tr = ($refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}/$refH_file->{$k_file}{'TotalSBSCoding'})*100; + $percent_Tr = sprintf("%.2f", $percent_Tr); $percent_ta_Tr += $percent_Tr; + print PERCENT "$k_mutation:$k_context\tNonTranscribed\t$percent_NonTr\t$k_file\n"; + print PERCENT "$k_mutation:$k_context\tTranscribed\t$percent_Tr\t$k_file\n"; + + $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+7, $percent_NonTr, $format_A10); + $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+8, $percent_Tr, $format_A10); + } + if($k_mutation eq "T>C") + { + $tc_NonTr += $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}; + $tc_Tr += $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}; + + # COUNT : 12 mutation type (stranded bar graph) + $ws->write($row_SeqContext12, $colStart_matrixSeqContext+9, 
$refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}, $format_A10); + print COUNT "$k_mutation:$k_context\tNonTranscribed\t$refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}\t$k_file\n"; + $ws->write($row_SeqContext12, $colStart_matrixSeqContext+10, $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}, $format_A10); + print COUNT "$k_mutation:$k_context\tTranscribed\t$refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}\t$k_file\n"; + + ## PERCENT : 12 mutation type (stranded bar graph) + my $percent_NonTr = ($refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}/$refH_file->{$k_file}{'TotalSBSCoding'})*100; + $percent_NonTr = sprintf("%.2f", $percent_NonTr); $percent_tc_NonTr += $percent_NonTr; + my $percent_Tr = ($refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}/$refH_file->{$k_file}{'TotalSBSCoding'})*100; + $percent_Tr = sprintf("%.2f", $percent_Tr); $percent_tc_Tr += $percent_Tr; + print PERCENT "$k_mutation:$k_context\tNonTranscribed\t$percent_NonTr\t$k_file\n"; + print PERCENT "$k_mutation:$k_context\tTranscribed\t$percent_Tr\t$k_file\n"; + + $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+9, $percent_NonTr, $format_A10); + $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+10, $percent_Tr, $format_A10); + } + if($k_mutation eq "T>G") + { + $tg_NonTr += $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}; + $tg_Tr += $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}; + + # COUNT : 12 mutation type (stranded bar graph) + $ws->write($row_SeqContext12, $colStart_matrixSeqContext+11, $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}, $format_A10); + print COUNT "$k_mutation:$k_context\tNonTranscribed\t$refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}\t$k_file\n"; + $ws->write($row_SeqContext12, $colStart_matrixSeqContext+12, 
$refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}, $format_A10); + print COUNT "$k_mutation:$k_context\tTranscribed\t$refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}\t$k_file\n"; + + ## PERCENT : 12 mutation type (stranded bar graph) + my $percent_NonTr = ($refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}/$refH_file->{$k_file}{'TotalSBSCoding'})*100; + $percent_NonTr = sprintf("%.2f", $percent_NonTr); $percent_tg_NonTr += $percent_NonTr; + my $percent_Tr = ($refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}/$refH_file->{$k_file}{'TotalSBSCoding'})*100; + $percent_Tr = sprintf("%.2f", $percent_Tr); $percent_tg_Tr += $percent_Tr; + print PERCENT "$k_mutation:$k_context\tNonTranscribed\t$percent_NonTr\t$k_file\n"; + print PERCENT "$k_mutation:$k_context\tTranscribed\t$percent_Tr\t$k_file\n"; + + $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+11, $percent_NonTr, $format_A10); + $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+12, $percent_Tr, $format_A10); + } + + # For checking if the total number of SBS is correct + $total_SBS_coding += $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'} + $refH_file->{$k_file}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}; + } + $row_SeqContext12++; $row_SeqContext12Percent++; + } + close COUNT; close PERCENT; + + ## Write the total of each mutation types: 12 mut type on coding strand + $ws->write($row_SeqContext12, $colStart_matrixSeqContext+1, $ca_NonTr, $formatT_bottomHeader2); $ws->write($row_SeqContext12, $colStart_matrixSeqContext+2, $ca_Tr, $formatT_bottomHeader2); + $ws->write($row_SeqContext12, $colStart_matrixSeqContext+3, $cg_NonTr, $formatT_bottomHeader2); $ws->write($row_SeqContext12, $colStart_matrixSeqContext+4, $cg_Tr, $formatT_bottomHeader2); + $ws->write($row_SeqContext12, $colStart_matrixSeqContext+5, $ct_NonTr, $formatT_bottomHeader2); $ws->write($row_SeqContext12, 
$colStart_matrixSeqContext+6, $ct_Tr, $formatT_bottomHeader2); + $ws->write($row_SeqContext12, $colStart_matrixSeqContext+7, $ta_NonTr, $formatT_bottomHeader2); $ws->write($row_SeqContext12, $colStart_matrixSeqContext+8, $ta_Tr, $formatT_bottomHeader2); + $ws->write($row_SeqContext12, $colStart_matrixSeqContext+9, $tc_NonTr, $formatT_bottomHeader2); $ws->write($row_SeqContext12, $colStart_matrixSeqContext+10, $tc_Tr, $formatT_bottomHeader2); + $ws->write($row_SeqContext12, $colStart_matrixSeqContext+11, $tg_NonTr, $formatT_bottomHeader2); $ws->write($row_SeqContext12, $colStart_matrixSeqContext+12, $tg_Tr, $formatT_bottomHeader2); + # Write the total percentages of each mutation types: 12 mut type on coding strand + $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+1, $percent_ca_NonTr, $formatT_bottomHeader); $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+2, $percent_ca_Tr, $formatT_bottomHeader); + $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+3, $percent_cg_NonTr, $formatT_bottomHeader); $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+4, $percent_cg_Tr, $formatT_bottomHeader); + $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+5, $percent_ct_NonTr, $formatT_bottomHeader); $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+6, $percent_ct_Tr, $formatT_bottomHeader); + $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+7, $percent_ta_NonTr, $formatT_bottomHeader); $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+8, $percent_ta_Tr, $formatT_bottomHeader); + $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+9, $percent_tc_NonTr, $formatT_bottomHeader); $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+10, $percent_tc_Tr, $formatT_bottomHeader); + $ws->write($row_SeqContext12Percent, $colStart_matrixSeqContext+11, $percent_tg_NonTr, $formatT_bottomHeader); $ws->write($row_SeqContext12Percent, 
$colStart_matrixSeqContext+12, $percent_tg_Tr, $formatT_bottomHeader); + + if($total_SBS_coding == $refH_file->{$k_file}{'TotalSBSCoding'}) { $ws->write($row_SeqContext12, $colStart_matrixSeqContext+13, $refH_file->{$k_file}{'TotalSBSCoding'}, $formatT_bottomHeader2) } + else { print STDERR "Error in the calculation of the total number of SBS on the coding strand!!!!\nFrom hash table $refH_file->{$k_file}{'TotalSBSCoding'}\tVS\t$total_SBS_coding\n"; exit; } + + + my $totalP_SBS_coding = $percent_ca_NonTr + $percent_ca_Tr + $percent_cg_NonTr + $percent_cg_Tr + $percent_ct_NonTr + $percent_ct_Tr + $percent_ta_NonTr + $percent_ta_Tr + $percent_tc_NonTr + $percent_tc_Tr + $percent_tg_NonTr + $percent_tg_Tr; $totalP_SBS_coding = sprintf("%.0f", $totalP_SBS_coding); + if($totalP_SBS_coding != 100) { print STDERR "The percentages for the trinucleotide sequence context on the coding strand for 12 mutation types is not equal to 100!!!\n$totalP_SBS_coding\n"; exit; } + + + ########################################################################################################################################################### + ################################################################### GRAPHS & TABLES ####################################################################### + ########################################################################################################################################################### + Create_Graph($folderFigure, $k_file, $maxValue); + + ## Distribution of SBS into the Excel report (Figure 1 + Table 1) + $ws->write(0, 0, "Graph 1. SBS distribution", $formatT_graphTitle); $ws->set_row(0, 18); + $ws->insert_image(1, 0, "$folder_temp/$k_file-SBS_distribution-Report.png", 0, 0, .2, .2); + $ws->write(29, 0, "Table 1. 
Frequency and counts of all SBS", $format_A10Boldleft); + $ws->write(30, 0, "Mutation type", $table_topleft); $ws->write(30, 1, "Percentage", $table_top); $ws->write(30, 2, "Count", $table_topRight); + $ws->write(31, 0, "C:G>A:T", $table_left); $ws->write(31, 1, $percent_ca, $format_A10); $ws->write(31, 2, $ca_genomique, $table_right); + $ws->write(32, 0, "C:G>G:C", $table_left); $ws->write(32, 1, $percent_cg, $format_A10); $ws->write(32, 2, $cg_genomique, $table_right); + $ws->write(33, 0, "C:G>T:A", $table_left); $ws->write(33, 1, $percent_ct, $format_A10); $ws->write(33, 2, $ct_genomique, $table_right); + $ws->write(34, 0, "T:A>A:T", $table_left); $ws->write(34, 1, $percent_ta, $format_A10); $ws->write(34, 2, $ta_genomique, $table_right); + $ws->write(35, 0, "T:A>C:G", $table_left); $ws->write(35, 1, $percent_tc, $format_A10); $ws->write(35, 2, $tc_genomique, $table_right); + $ws->write(36, 0, "T:A>G:C", $table_left); $ws->write(36, 1, $percent_tg, $format_A10); $ws->write(36, 2, $tg_genomique, $table_right); + $ws->write(37, 0, "", $table_bottomleft); $ws->write(37, 1, "", $table_bottom); $ws->write(37, 2, $refH_file->{$k_file}{'TotalSBSGenomic'}, $table_bottomrightHeader); + + ## Impact of the SBS on the protein + $ws->write(0, 6, "Graph 2. Impact on protein sequence", $formatT_graphTitle); + $ws->insert_image(1, 6, "$folder_temp/$k_file-DistributionExoFunc-Report.png", 0, 0, .2, .2); + + ## Strand Bias + $ws->write(0, 11, "Graph 3. 
Stranded distribution of SBS", $formatT_graphTitle); + $ws->insert_image(1, 11, "$folder_temp/$k_file-StrandBias-Report.png", 0, 0, .2, .2); + + ## Stranded signature (Scale the inserted image: width x 0.7, height x 0.8) + $ws->insert_image($rowStart_SBSdistrBySeg+3, $colStart_matrixSeqContext+15, "$folder_temp/$k_file-StrandedSignatureCount-Report.png", 0, 0, .16, .16); + $ws->insert_image($rowStart_SBSdistrBySeg+24, $colStart_matrixSeqContext+15, "$folder_temp/$k_file-StrandedSignaturePercent-Report.png", 0, 0, .16, .16); + + + # Heatamp for the sequence context on the genomic strand (6 mutation types) + $ws->insert_image(4, $colStart_matrixSeqContext, "$folder_temp/$k_file-HeatmapCount-Genomic-Report.png"); + $ws->insert_image(4, $colStart_matrixSeqContext+10, "$folder_temp/$k_file-HeatmapPercent-Genomic-Report.png"); + + + ## Bar plot for representing the sequence context (NMF like style) + `Rscript $pathRScriptMutSpectrum $folderFigure/Trinucleotide_Sequence_Context/$k_file/$k_file-MutationSpectraPercent-Genomic.txt $k_file $folderFigure/Trinucleotide_Sequence_Context/$k_file $folder_temp $c_ca6_g $c_cg6_g $c_ct6_g $c_ta6_g $c_tc6_g $c_tg6_g 2>&1`; + + # Bar plot for the sequence context on the genomic strand (6 mutation types) + $ws->insert_image(27, $colStart_matrixSeqContext+3, "$folder_temp/$k_file-MutationSpectraPercent-Genomic-Report.png"); + + # Next sample + $row_SumSheet++; + } # End $k_file + + #----------------------------------------------------------------------------------------------------------------------------------------------------------------# + # Write the input matrix for NMF + open(OUTINPUTNMFC, ">", "$folderNMF/Input_NMF_Count.txt") or die "$!: $folderNMF/Input_NMF_Count.txt\n"; # with the count + open(OUTINPUTNMFP, ">", "$folderNMF/Input_NMF_Frequency.txt") or die "$!: $folderNMF/Input_NMF_Frequency.txt\n"; # With the frequency un-normalized + + foreach my $k_sample (@{$h_inputNMF{'Sample'}}) { print OUTINPUTNMFC "\t$k_sample"; print 
OUTINPUTNMFP "\t$k_sample"; }
	print OUTINPUTNMFC "\n"; print OUTINPUTNMFP "\n";

	my $row_inputNMF = 1;
	foreach my $k_context (sort keys $h_inputNMF{'Count'})
	{
		$k_context =~ /(\w)_(\w)/; my ($base5, $base3) = ($1, $2);
		foreach my $k_mutation (sort keys $h_inputNMF{'Count'}{$k_context})
		{
			my ($col_inputNMF_Count, $col_inputNMF_Percent) = (1, 1);
			my $contextNMF = $base5."[$k_mutation]".$base3;
			$ws_inputNMF_count->write($row_inputNMF, 0, $contextNMF); $ws_inputNMF_percent->write($row_inputNMF, 0, $contextNMF);
			print OUTINPUTNMFC $contextNMF,"\t"; print OUTINPUTNMFP $contextNMF,"\t";

			foreach (@{$h_inputNMF{'Count'}{$k_context}{$k_mutation}}) { print OUTINPUTNMFC "$_\t"; } print OUTINPUTNMFC "\n";
			foreach (@{$h_inputNMF{'Percent'}{$k_context}{$k_mutation}}) { print OUTINPUTNMFP "$_\t"; } print OUTINPUTNMFP "\n";

			foreach (@{$h_inputNMF{'Count'}{$k_context}{$k_mutation}})
			{
				# print "\t$k_context\t$k_mutation\t";
				# print "\t$row_inputNMF\t$col_inputNMF_Count\t$_\n";
				$ws_inputNMF_count->write($row_inputNMF, $col_inputNMF_Count, $_); $col_inputNMF_Count++;
			}
			foreach (@{$h_inputNMF{'Percent'}{$k_context}{$k_mutation}}) { $ws_inputNMF_percent->write($row_inputNMF, $col_inputNMF_Percent, $_); $col_inputNMF_Percent++; }
			$row_inputNMF++;
		}
	}
	close OUTINPUTNMFP; close OUTINPUTNMFC;


	# Close the workbook
	$wb->close();
}

# Calculate the chi2 for the strand bias
#
# Arguments:
#   $refH_file  : hash reference read as $refH_file->{file}{'6mutType'}{func}{mutation}{'NonTr'|'Tr'}
#   $folderChi2 : folder in which the input and output tables of the test are written
#
# The NonTr / Tr counts are summed over every functional category, per sample and per
# mutation type, and written to "$folderChi2/Input_chi2_strandBias.txt". R (through
# Statistics::R) then runs prop.test() against a proportion of 0.5 on every row, adjusts
# the p-values of the samples with Benjamini-Hochberg (the Pool_Data rows are kept out of
# the adjustment) and writes "$folderChi2/Output_chi2_strandBias.txt".
# Prints a message on STDERR and exits when $refH_file is empty.
sub CalculateChi2
{
	my ($refH_file, $folderChi2) = @_;

	# No value for the chi2
	# (explicit %{...}: calling keys directly on a reference is refused by perl >= 5.24)
	if(scalar(keys %{$refH_file}) == 0) { print STDERR "No value for calculating the chi2 for the strand bias\n"; exit; }

	# Label used in the chi2 table for each of the 6 mutation types
	my %shortMutType = (
		"C:G>A:T" => 'C>A',
		"C:G>G:C" => 'C>G',
		"C:G>T:A" => 'C>T',
		"T:A>A:T" => 'T>A',
		"T:A>C:G" => 'T>C',
		"T:A>G:C" => 'T>G',
	);

	# Strand bias for one mutation type for all the samples
	my %h_tempchi2 = ();

	foreach my $k_file (sort keys %{$refH_file})
	{
		foreach my $k_func (sort keys %{$refH_file->{$k_file}{'6mutType'}})
		{
			foreach my $k_mutation (sort keys %{$refH_file->{$k_file}{'6mutType'}{$k_func}})
			{
				# Any key that is not one of the 6 mutation types is ignored
				my $shortName = $shortMutType{$k_mutation};
				if(! defined $shortName) { next; }

				$h_tempchi2{$shortName}{$k_file}{'NonTr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'};
				$h_tempchi2{$shortName}{$k_file}{'Tr'}    += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'};
			}
		}
	}

	# Create the input file for the chi2 test
	open(CHI2, ">", "$folderChi2/Input_chi2_strandBias.txt") or die "$!: $folderChi2/Input_chi2_strandBias.txt\n";
	print CHI2 "SampleName\tNonTr\tTr\tAlteration\n";

	foreach my $k_mutation (sort keys %h_tempchi2)
	{
		foreach my $k_file (sort keys %{$h_tempchi2{$k_mutation}})
		{
			print CHI2 "$k_file\t$h_tempchi2{$k_mutation}{$k_file}{'NonTr'}\t$h_tempchi2{$k_mutation}{$k_file}{'Tr'}\t$k_mutation\n";
		}
	}
	close CHI2;


	# Open the connection with R
	my $R = Statistics::R->new() or die "Impossible to create a communication bridge with R\n";

	$R->send(qq`## Load the data. There is one column with the mutation type and the sample name but it's just for knowing what is corresponding to each line. The two columns with the number of variant per strand would be sufficient.
	strBias<-read.delim("$folderChi2/Input_chi2_strandBias.txt", dec=".");`);
	$R->send(q`# Chi2
	pValChi2       <- c() # p-values of the samples (indexed by k)
	pValChi2_round <- c() # Rounded p-values of the samples (indexed by k)
	confInt        <- c() # Confidence interval, one value per row of strBias (indexed by i)
	proportion     <- c() # Proportion of NonTr compared to (NonTr+Tr), one value per row (indexed by i)
	sampleSize     <- c() # Count on NonTr and Tr, one value per row (indexed by i)
	# For Pool_Data save the p-values in a different vector for not having them for the FDR
	pValChi2_PoolData       <- c()
	pValChi2_PoolData_Round <- c()

	j = 1 # Counter for the pValChi2_PoolData vector
	k = 1 # Counter for the pValChi2 vector

	for(i in seq_len(nrow(strBias)))
	{
		isPool <- (strBias[i,1] == "Pool_Data")
		total  <- sum(strBias[i,2:3])

		if(total != 0)
		{
			# The test is run once and gives both the p-value and the confidence interval
			test <- prop.test(x=strBias[i,2], n=total, p=0.5)

			if(isPool)
			{
				pValChi2_PoolData[j] <- test$p.value
				j <- j+1
			} else
			{
				pValChi2[k] <- test$p.value
				k <- k+1
			}

			# Confidence interval
			temp <- test$conf.int
			confInt[i] <- paste0("[", round(temp[1],2), "-", round(temp[2],2), "]") # Same as paste(sep="")

			# Save the proportion
			proportion[i] <- strBias[i,2] / total

			# Save the sample size (count on NonTr and Tr)
			sampleSize[i] <- paste(strBias[i,2], strBias[i,3], sep="-")
		} else
		{
			# Not enough effective for the test.
			# The per-row vectors are filled at the row index i (and also for Pool_Data),
			# otherwise they get out of step with strBias as soon as Pool_Data rows are present.
			confInt[i]    <- NA
			proportion[i] <- NA
			sampleSize[i] <- NA

			if(isPool)
			{
				pValChi2_PoolData[j]       <- NA
				pValChi2_PoolData_Round[j] <- NA
				j <- j+1
			} else
			{
				pValChi2[k]       <- NA
				pValChi2_round[k] <- NA
				k <- k+1
			}
		}
	}
	# Adjust with FDR
	FDR<-p.adjust(pValChi2, method="BH")

	# Round the p-value
	for(i in seq_along(pValChi2))
	{
		if( (! is.na(pValChi2[i])) && (pValChi2[i] < 0.0001) )
		{
			pValChi2_round[i] <- format(pValChi2[i], scientific=TRUE, digits=3)
		} else if(! is.na(pValChi2[i]))
		{
			pValChi2_round[i] <- as.character(round(pValChi2[i], 3))
		}
	}

	# The option for the pool is specified
	if(!is.null(pValChi2_PoolData))
	{
		# Round the p-value for Pool_Data
		for(i in seq_along(pValChi2_PoolData))
		{
			if( (! is.na(pValChi2_PoolData[i])) && (pValChi2_PoolData[i] < 0.0001) )
			{
				pValChi2_PoolData_Round[i] <- format(pValChi2_PoolData[i], scientific=TRUE, digits=3)
			} else if(! is.na(pValChi2_PoolData[i]))
			{
				pValChi2_PoolData_Round[i] <- as.character(round(pValChi2_PoolData[i], 3))
			}
		}
	}


	# I create a dataframe for add what I want
	outputChi2 <- data.frame(round(strBias[,2]/strBias[,3], digits=2), sampleSize, round(proportion, 3), confInt)
	outputChi2$Mut.type   <- strBias$Alteration
	outputChi2$SampleName <- strBias$SampleName
	colnames(outputChi2)[1:6]<-c("Strand_Bias", "NonTr-Tr", "Proportion", "Confidence Interval", "Mutation_Type", "SampleName")

	# Transform the data frame into a matrix for adding the p-value for the samples and Pool_Data
	matrix <- as.matrix(outputChi2)
	tempColPValFDR <- matrix(, nrow=nrow(strBias), ncol = 2) # Create an empty matrix with 2 columns for adding the p-value and the FDR
	matrix <- cbind(matrix, tempColPValFDR)
	j = 1 # Counter for all the sample
	k = 1 # Counter for Pool_Data
	for(i in seq_len(nrow(matrix)))
	{
		if(matrix[i,6] == "Pool_Data")
		{
			matrix[i,7] <- pValChi2_PoolData_Round[k]
			matrix[i,8] <- "NA" # No FDR for Pool_Data
			k = k+1
		} else
		{
			matrix[i,7] <- pValChi2_round[j]
			matrix[i,8] <- round(FDR[j], 3)
			j = j+1
		}
	}
	# Reorder the columns
	matrix <- cbind(matrix[,1:3], matrix[,7], matrix[,8], matrix[,4:6])
	colnames(matrix)[4] <- "P-val-Chi2"
	colnames(matrix)[5] <- "FDR"`);

	$R->send(qq`# Export the file
	# dec=".": Set the separator for the decimal by "."
	write.table(matrix,file="$folderChi2/Output_chi2_strandBias.txt",quote = FALSE,sep="\t",row.names = FALSE,dec=".");`);

	# Stop the connection with R
	$R->stop();
}

# Pearson correlation
#
# Arguments:
#   $refH_file : hash reference; the counts are read from $refH_file->{$filename}{'SBSPerChr'}
#   $filename  : name of the sample to process
#
# For every mutation type found under 'SBSPerChr' (the entries AllMutType, TotalPerChr and
# ChrSize are skipped) the chromosome sizes taken from the file-level hash %chromosomes are
# correlated with the number of SBS per chromosome; the coefficient is stored in
# {'SBSPerChr'}{mutation}{'Pearson'}. The same is done with the counts under 'TotalPerChr'
# and stored in {'SBSPerChr'}{'AllMutType'}.
# The coefficient is set to 0 when every count is equal to zero.
sub PearsonCoefficient
{
	# NOTE(review): the arguments are stored in package variables ("our"), exactly as
	# before; kept in case other parts of the script read them -- to be confirmed.
	our ($refH_file, $filename) = @_;

	#### Calculate the Pearson coefficient per mutation type
	foreach my $k_mutation (sort keys %{$refH_file->{$filename}{'SBSPerChr'}})
	{
		if( ($k_mutation eq "AllMutType") || ($k_mutation eq "TotalPerChr") || ($k_mutation eq "ChrSize") ) { next; }

		# Anonymous 2D array: $x->[$i] is a row (rows start at 1),
		# $x->[$i][1] is the chromosome size and $x->[$i][2] the count of SBS
		my $x = [];
		my $testZero = 0; # The correlation function doesn't work if all the variables are equal to zero
		my $i = 1;
		while( my ($chromosome, $lenght) = each (%chromosomes) )
		{
			my $count = $refH_file->{$filename}{'SBSPerChr'}{$k_mutation}{'CHR'}{$chromosome}{'chr'};
			$x->[$i][1] = $lenght;
			$x->[$i][2] = $count;
			if( (! defined $count) || ($count == 0) ) { $testZero++; }
			$i++;
		}

		my $correlation = 0;
		if( $testZero != scalar(keys %{$refH_file->{$filename}{'SBSPerChr'}{$k_mutation}{'CHR'}}) ) { $correlation = correlation($x); }

		$refH_file->{$filename}{'SBSPerChr'}{$k_mutation}{'Pearson'} = $correlation; # Pearson per mutation type
	}

	#### Same calculation for all the mutation types together
	my $x = [];
	my $testZero = 0;
	my $i = 1;
	while( my ($chromosome, $lenght) = each (%chromosomes) )
	{
		my $count = $refH_file->{$filename}{'SBSPerChr'}{'TotalPerChr'}{$chromosome}{'chr'};
		$x->[$i][1] = $lenght;
		$x->[$i][2] = $count;
		# The zero counts were never counted here, so the test below could not detect them
		if( (! defined $count) || ($count == 0) ) { $testZero++; }
		$i++;
	}

	my $correlation = 0;
	if( $testZero != scalar(keys %{$refH_file->{$filename}{'SBSPerChr'}{'TotalPerChr'}}) ) { $correlation = correlation($x); }
	$refH_file->{$filename}{'SBSPerChr'}{'AllMutType'} = $correlation;
}

# Pearson correlation coefficient of a 2D array filled from row 1
# ($x->[$i][1] and $x->[$i][2] are the two variables), formatted with 2 decimals.
# (The four helpers below were written inside PearsonCoefficient; named subs are
#  package-wide in Perl, so declaring them at file level is equivalent.)
sub correlation
{
	my ($x) = @_;
	my ($mean_x, $mean_y) = mean($x);
	my $ssxx = ss($x, $mean_x, $mean_y, 1, 1);
	my $ssyy = ss($x, $mean_x, $mean_y, 2, 2);
	my $ssxy = ss($x, $mean_x, $mean_y, 1, 2);
	return sprintf("%.2f", correl($ssxx, $ssyy, $ssxy));
}

# Mean of the two columns of the 2D array. Returns ($mean_col1, $mean_col2).
sub mean
{
	my ($x) = @_;

	# Index 0 is never filled: the data are in rows 1 .. last.
	# (The loop used to start at row 2 and to divide by (size - 2): one chromosome was
	#  left out of the calculation and a single row ended in a division by zero.)
	my $num = scalar(@{$x}) - 1;
	if($num < 1) { return (0, 0); }

	my ($sum_x, $sum_y) = (0, 0);
	for my $i (1 .. $num)
	{
		$sum_x += $x->[$i][1];
		$sum_y += $x->[$i][2];
	}
	return ($sum_x / $num, $sum_y / $num);
}

### ss = sum of squared (deviations to the mean)
# $one and $two are the columns (1 or 2) which are multiplied together
sub ss
{
	my ($x, $mean_x, $mean_y, $one, $two) = @_;
	my $sum = 0;
	# Same rows as in mean(): 1 .. last
	for my $i (1 .. scalar(@{$x}) - 1)
	{
		$sum += ($x->[$i][$one] - $mean_x) * ($x->[$i][$two] - $mean_y);
	}
	return $sum;
}

# Correlation from the sums of squares; 0 when one of the variables doesn't vary
sub correl
{
	my ($ssxx, $ssyy, $ssxy) = @_;

	if( ($ssxx == 0) || ($ssyy == 0) ) { return 0; }

	my $sign = 0;
	if(abs($ssxy) != 0) { $sign = $ssxy / abs($ssxy); }

	return $sign * sqrt($ssxy * $ssxy / ($ssxx * $ssyy));
}

# Complement bases (for the sequence context)
# Returns the complementary base of A, C, G or T (upper case only) and an empty
# string for anything else.
sub complement
{
	my ($base) = @_;
	my %pairOf = ('A' => 'T', 'C' => 'G', 'G' => 'C', 'T' => 'A');

	# The fallback is now explicit; it used to be whatever the last "if" left behind
	if( defined($base) && exists($pairOf{$base}) ) { return $pairOf{$base}; }
	return "";
}

# Create and write some graphics
sub Create_Graph
{
	our ($folderFigure, $filename, $maxValue) = @_;

	# Open the connection with R
	my $R = Statistics::R->new() or die "Impossible to create a communication bridge with R\n";
	$R->startR() ;
	# Load the Library
	$R->send(q`library(ggplot2)`);
	$R->send(q`library(gplots)`);
	$R->send(q`library(gtable)`);


# Overall mutation distribution: the table is read into the R session; the string is interpolated (qq) so that $folderFigure and $filename are expanded by Perl + $R->send(qq`########################################## + ## OVERALL MUTATION DISTRIBUTION ## + ########################################## + distrMut <- read.table("$folderFigure/Overall_mutation_distribution/$filename/$filename-OverallMutationDistribution.txt", header=T)`); + # This chunk is q-quoted (no interpolation) because the R code uses the dollar column accessor + $R->send(q`# Add the count of each category in the legend + distrMut$Legend[[1]] <- paste0(distrMut$Variant_type[[1]], " (", distrMut$Count[[1]], ")") + distrMut$Legend[[2]] <- paste0(distrMut$Variant_type[[2]], " (", distrMut$Count[[2]], ")") + distrMut$Legend[[3]] <- paste0(distrMut$Variant_type[[3]], " (", distrMut$Count[[3]], ")")`); + + # One interpolated R script draws all the remaining figures (pie charts, bar plots, heatmaps) and writes the PNG files under $folderFigure and $folder_temp + $R->send(qq`# Base plot + pie <- ggplot(distrMut, aes(x=factor(""), fill=Legend, weight=Count)) + geom_bar(width=1) + coord_polar(theta="y") + scale_x_discrete("", breaks=NULL) + scale_y_continuous("", breaks=NULL) + labs(fill="") + # Background of the plot entire white + pie <- pie + theme(panel.grid.major = element_line(colour="white"), panel.grid.minor = element_line(colour="white"), panel.background = element_rect(fill="white")) + # Legend on right in 3 rows + pie <- pie + theme(legend.position="bottom") + guides(fill=guide_legend(nrow=3)) + # Change the color and the title of the legend + pie <- pie + scale_fill_brewer("Variant type", palette="Set1") + # Remove all the margins + pie <- pie + theme(plot.margin=unit(c(-1, 0, -1.5, 0), "cm")) + # Save the pie chart for the HTML page (higher resolution) + options(bitmapType='cairo') # Use cairo device as isn't possible to install X11 on the server... 
+ png("$folderFigure/Overall_mutation_distribution/$filename/$filename-OverallMutationDistribution.png", width=700, height=1100, res=300) + print(pie) + dev.off() + + + ########################################## + ## SBS MUTATION DISTRIBUTION ## + ########################################## + distrSBS <- read.delim("$folderFigure/SBS_distribution/$filename/$filename-SBS_distribution.txt") + distrSBS <- data.frame(distrSBS) + bar <- ggplot(distrSBS, aes(x=Mutation_Type, y=Percentage, fill=Mutation_Type)) + bar <- bar + geom_bar(stat="identity", width=0.5) + # Theme classic + bar <- bar + theme_classic() + # Remove the axis legend + bar <- bar + xlab("") + # Set the color of the bars and Changing the labels in the legend + bar <- bar + scale_fill_manual(values=c("blue", "black", "red", "gray", "#00CC33", "pink"), + labels=c("C:G>A:T", "C:G>G:C", "C:G>T:A", "T:A>A:T", "T:A>C:G", "T:A>G:C") + ) + # Remove the label in x axis + bar <- bar + theme(axis.text.x = element_blank()) + # Change the name of the y label + bar <- bar + ylab("Percent") + # Save the plot for the HTML page (higher resolution) + options(bitmapType='cairo') + png("$folderFigure/SBS_distribution/$filename/$filename-SBS_distribution.png", width=1800, height=1500, res=300) + print(bar); + dev.off() + # Save the plot for the report + bar + ggsave("$folder_temp/$filename-SBS_distribution-Report.png") + + + ########################################## + ## IMPACT ON PROTEIN ## + ########################################## + impactProt <- read.table("$folderFigure/Impact_protein_sequence/$filename/$filename-DistributionExoFunc.txt", header=T) + # Custom palette: black, orange, dark green, yellow, light blue, dark blue, darkslateblue, red, purple, pink, light green, turquoise, gray + cb_palette <- c("#000000", "#E69F00", "#006600", "#660000", "#F0E442", "#56B4E9", "#3300FF", "#483D8B", "#FF0000", "#9900CC", "#FF66CC", "#00CC00", "#66FFFF", "#C0C0C0") + pie <- ggplot(impactProt, aes(x=factor(""), fill=AA_Change, 
weight=Count)) + geom_bar(width=1) + coord_polar(theta="y") + scale_x_discrete("", breaks=NULL)+ scale_y_continuous("", breaks=NULL) + scale_fill_manual(values=cb_palette) + # Background of the plot entire white + pie <- pie + theme(panel.grid.major = element_line(colour="white"), panel.grid.minor = element_line(colour="white"), panel.background = element_rect(fill="white")) + # Legend in two column + pie <- pie + guides(fill=guide_legend(ncol=2)) + theme(legend.position="bottom") + # Remove the legend title + pie <- pie + labs(fill="") + # Save the plot for the HTML page (higher resolution) + options(bitmapType='cairo') + png("$folderFigure/Impact_protein_sequence/$filename/$filename-DistributionExoFunc.png", width=1600, height=1800, res=300) + print(pie) + dev.off() + # Save the plot for the report + pie + ggsave("$folder_temp/$filename-DistributionExoFunc-Report.png") + + + ########################################## + ## STRAND BIAS ## + ########################################## + cb_palette_SB <- c("#0072B2", "#CC0000") + file_sb <- read.table("$folderFigure/Stranded_Analysis/$filename/$filename-StrandBias.txt", header=T); + p_sb <- ggplot(file_sb, aes(x=Alteration, y=Count, fill=Strand)) + theme_classic() + geom_bar(stat="identity", position="dodge") + scale_fill_manual(values=cb_palette_SB) + theme(axis.text.x = element_text(angle=60, hjust=1)) + xlab("") + theme(legend.position="bottom") + # Save the plot for the HTML page (higher resolution) + options(bitmapType='cairo') + png("$folderFigure/Stranded_Analysis/$filename/$filename-StrandBias.png", width=1000, height=1200, res=300) + print(p_sb) + dev.off() + # Save the plot for the report + p_sb + ggsave("$folder_temp/$filename-StrandBias-Report.png") + + + ########################################## + ## HEATMAP SEQUENCE CONTEXT ## + ## GENOMIC STRAND ## + ########################################## + ## COUNT + heatmap_G <- 
read.table("$folderFigure/Trinucleotide_Sequence_Context/$filename/$filename-HeatmapCount-Genomic.txt", header=T) + # Save the plot for the report + options(bitmapType='cairo') + png(filename="$folder_temp/$filename-HeatmapCount-Genomic-Report.png", bg="transparent", width=240, height=360) + # Heatmap with an absolute scale + heatmap.2(as.matrix(heatmap_G),Rowv=F,Colv=F,col=colorpanel(384,low="yellow",high="red"),dendrogram="none",scale="none",trace="none",key=F,labRow=rownames(as.matrix(heatmap_G)),labCol=colnames(as.matrix(heatmap_G)),lmat=rbind(c(5,1,4),c(3,1,2)), lhei=c(0.75,0.75),lwid=c(0.5,1.5,0.5)) + dev.off() + # Save the plot for the HTML page (higher resolution) + options(bitmapType='cairo') + png(filename="$folderFigure/Trinucleotide_Sequence_Context/$filename/$filename-HeatmapCount-Genomic.png", width=1100, height=1600, res=300) + heatmap.2(as.matrix(heatmap_G),Rowv=F,Colv=F,col=colorpanel(384,low="yellow",high="red"),dendrogram="none",scale="none",trace="none",key=F,labRow=rownames(as.matrix(heatmap_G)),labCol=colnames(as.matrix(heatmap_G)),lmat=rbind(c(5,1,4),c(3,1,2)), lhei=c(0.75,0.75),lwid=c(0.5,1.5,0.5)) + dev.off() + + ## PERCENT + heatmap_G <- read.table("$folderFigure/Trinucleotide_Sequence_Context/$filename/$filename-HeatmapPercent-Genomic.txt", header=T) + # Save the plot for the report + options(bitmapType='cairo') + png(filename="$folder_temp/$filename-HeatmapPercent-Genomic-Report.png",bg="transparent", width=240, height=360) + # Heatmap with an absolute scale + heatmap.2(as.matrix(heatmap_G),Rowv=F,Colv=F,col=colorpanel(384,low="yellow",high="red"),dendrogram="none",scale="none",trace="none",key=F,labRow=rownames(as.matrix(heatmap_G)),labCol=colnames(as.matrix(heatmap_G)),lmat=rbind(c(5,1,4),c(3,1,2)), lhei=c(0.75,0.75),lwid=c(0.5,1.5,0.5)) + dev.off() + # Save the plot for the HTML page (higher resolution) + options(bitmapType='cairo') + 
png(filename="$folderFigure/Trinucleotide_Sequence_Context/$filename/$filename-HeatmapPercent-Genomic.png", width=1100, height=1600, res=300) + heatmap.2(as.matrix(heatmap_G),Rowv=F,Colv=F,col=colorpanel(384,low="yellow",high="red"),dendrogram="none",scale="none",trace="none",key=F,labRow=rownames(as.matrix(heatmap_G)),labCol=colnames(as.matrix(heatmap_G)),lmat=rbind(c(5,1,4),c(3,1,2)), lhei=c(0.75,0.75),lwid=c(0.5,1.5,0.5)) + dev.off()`); + # Close the R session + $R->stopR() ; + + ## Plot the transcriptional strand bias in mutation signature + # The external R script $pathRScriptTxnSB is run once on the Count table and once on the Percent table; its output (stdout and stderr) is captured by the backticks and not used + `Rscript $pathRScriptTxnSB $folderFigure/Stranded_Analysis/$filename/$filename-StrandedSignatureCount.txt $folderFigure/Stranded_Analysis/$filename/$filename-StrandedSignatureCount $folder_temp/$filename-StrandedSignatureCount Count 2>&1`; + `Rscript $pathRScriptTxnSB $folderFigure/Stranded_Analysis/$filename/$filename-StrandedSignaturePercent.txt $folderFigure/Stranded_Analysis/$filename/$filename-StrandedSignaturePercent $folder_temp/$filename-StrandedSignaturePercent Percent 2>&1`; + } + # Write the titles of the different sections of the report + # Writes the title and the border cells of Tables 4 to 6 and of Panels 1 and 2 on the worksheet $ws, with the formats filled in by Format_section() + # The arguments are stored in package (our) variables; the nested subs below read them + # NOTE(review): $nb_func is presumably the number of functional regions listed in the tables; confirm against the caller + sub WriteBoderSection + { + our ($wb, $ws, $rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg, $nb_func, $colStart_matrixSeqContext) = @_; + + our ($format_topLeft, $format_topRight, $format_bottomLeft, $format_bottomRight, $format_top, $format_right, $format_bottom, $format_left); + Format_section($wb, \$format_topLeft, \$format_topRight, \$format_bottomLeft, \$format_bottomRight, \$format_top, \$format_right, \$format_bottom, \$format_left); + + TableSBSDistrBySeg(); + TableStrandBiasBySegment(); + CountSBSPerChr(); + ShortTriNtContext(); # 6 mut type + LongTriNtContext(); # 12 mut type + + # Title and border cells of Table 4 + sub TableSBSDistrBySeg + { + # Top-Left + $ws->write($rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg, "Table 4. 
SBS distribution by functional region", $format_topLeft); $ws->set_row($rowStart_SBSdistrBySeg, 18); # Set the height of the row to 0.25" + # Top + # NOTE(review): this loop also writes column 13, which the Top-Right call just below overwrites + for(my $i=1; $i<=13; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg+$i, $format_top); } + # Top-Right + $ws->write_blank($rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg+13, $format_topRight); + # Right + $ws->write_blank($rowStart_SBSdistrBySeg+1, $colStart_SBSdistrBySeg+13, $format_right); + # Bottom-left + $ws->write_blank($rowStart_SBSdistrBySeg+$nb_func+5, $colStart_SBSdistrBySeg, $format_bottomLeft); + # Left + $ws->write_blank($rowStart_SBSdistrBySeg+1, $colStart_SBSdistrBySeg, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg, $format_left); + } + + # Title and border cells of Table 5; every row is an offset from $rowStart_SBSdistrBySeg that grows with $nb_func + sub TableStrandBiasBySegment + { + # Top-Left + $ws->write($rowStart_SBSdistrBySeg+$nb_func+8, $colStart_SBSdistrBySeg, "Table 5. Strand bias by functional region", $format_topLeft); $ws->set_row($rowStart_SBSdistrBySeg+$nb_func+8, 18); # Set the height of the row to 0.25" + # Top + for(my $i=1; $i<=10; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg+$nb_func+8, $colStart_SBSdistrBySeg+$i, $format_top); } + # Top-Right + $ws->write_blank($rowStart_SBSdistrBySeg+$nb_func+8, $colStart_SBSdistrBySeg+11, $format_topRight); + # Right + $ws->write_blank($rowStart_SBSdistrBySeg+$nb_func+9, $colStart_SBSdistrBySeg+11, $format_right); $ws->write_blank($rowStart_SBSdistrBySeg+($nb_func*2)+13, $colStart_SBSdistrBySeg+11, $format_right); + # Left + $ws->write_blank($rowStart_SBSdistrBySeg+$nb_func+9, $colStart_SBSdistrBySeg, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+$nb_func+10, $colStart_SBSdistrBySeg, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+($nb_func*2)+13, $colStart_SBSdistrBySeg, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+($nb_func*2)+14, $colStart_SBSdistrBySeg, $format_left); + # Bottom + $ws->write_blank($rowStart_SBSdistrBySeg+($nb_func*3)+16, 
$colStart_SBSdistrBySeg+4, $format_bottom); $ws->write_blank($rowStart_SBSdistrBySeg+($nb_func*3)+16, $colStart_SBSdistrBySeg+8, $format_bottom); + } + + # Title and border cells of Table 6 + # The offset of the bottom border row depends on $refGenome (defined outside this sub): 33 for hg, 30 for mm, 31 for rn + sub CountSBSPerChr + { + #### Top-Left + $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+4, $colStart_SBSdistrBySeg, "Table 6. SBS distribution per chromosome", $format_topLeft); $ws->set_row($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+4, 18); # Set the height of the row to 0.25" + #### Top + for(my $i=1; $i<8; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+4, $colStart_SBSdistrBySeg+$i, $format_top); } + #### Top-Right + $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+4, $colStart_SBSdistrBySeg+8, $format_topRight); + #### Right + $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+5, $colStart_SBSdistrBySeg+8, $format_right); $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+6, $colStart_SBSdistrBySeg+8, $format_right); + + #### Bottom-Right + # Human genome = 24 chromosomes + if($refGenome =~ /hg/) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+33, $colStart_SBSdistrBySeg+8, $format_bottomRight); } + # Mouse genome = 21 chromosomes + if($refGenome =~ /mm/) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+30, $colStart_SBSdistrBySeg+8, $format_bottomRight); } + # Rat genome = 22 chromosomes + if($refGenome =~ /rn/) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+31, $colStart_SBSdistrBySeg+8, $format_bottomRight); } + + #### Bottom + if($refGenome =~ /hg/) + { + $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+33, $colStart_SBSdistrBySeg+1, $format_bottom); + for(my $i=3; $i<=7; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+33, $colStart_SBSdistrBySeg+$i, $format_bottom); } + } + if($refGenome =~ /mm/) + { + $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+30, 
$colStart_SBSdistrBySeg+1, $format_bottom); + for(my $i=3; $i<=7; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+30, $colStart_SBSdistrBySeg+$i, $format_bottom); } + } + if($refGenome =~ /rn/) + { + $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+31, $colStart_SBSdistrBySeg+1, $format_bottom); + for(my $i=3; $i<=7; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+31, $colStart_SBSdistrBySeg+$i, $format_bottom); } + } + + #### Left + $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+5, $colStart_SBSdistrBySeg, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+6, $colStart_SBSdistrBySeg, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+7, $colStart_SBSdistrBySeg, $format_left); + + #### Bottom-left + if($refGenome =~ /hg/) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+33, $colStart_SBSdistrBySeg, $format_bottomLeft); } + if($refGenome =~ /mm/) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+30, $colStart_SBSdistrBySeg, $format_bottomLeft); } + if($refGenome =~ /rn/) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+31, $colStart_SBSdistrBySeg, $format_bottomLeft); } + } + + # Title and border cells of Panel 1, written on fixed rows 0 to 38 starting at column $colStart_matrixSeqContext + sub ShortTriNtContext + { + # NOTE(review): $format_headerSection is created and configured here but is not referenced again in this sub + my $format_headerSection = $wb->add_format(valign => 'left', bold => 1, font => 'Arial', size => 12); + $format_headerSection->set_left(2); $format_headerSection->set_left_color('blue'); + + # Top-left + $ws->write(0, $colStart_matrixSeqContext, "Panel 1. 
Trinucleotide sequence context of SBS on the genomic sequence", $format_topLeft); + # Top + for(my $i=1; $i<=19; $i++) { $ws->write_blank(0, $colStart_matrixSeqContext+$i, $format_top); } + # Top-right + $ws->write_blank(0, $colStart_matrixSeqContext+20, $format_topRight); + # Right + for(my $i=1; $i<=37; $i++) { $ws->write_blank($i, $colStart_matrixSeqContext+20, $format_right); } + # Bottom-right + $ws->write_blank(37, $colStart_matrixSeqContext+20, $format_bottomRight); + # Bottom + # The bottom line is drawn with $format_top on row 38, the row under the panel + for(my $i=1; $i<=19; $i++) { $ws->write_blank(38, $colStart_matrixSeqContext+$i, $format_top); } + # Bottom-left + $ws->write_blank(37, $colStart_matrixSeqContext, $format_bottomLeft); + # Left + $ws->write(1, $colStart_matrixSeqContext, "", $format_left); + for(my $i=3; $i<=36; $i++) { $ws->write_blank($i, $colStart_matrixSeqContext, $format_left); } + } + + # Title and border cells of Panel 2; the top starts at $rowStart_SBSdistrBySeg while the bottom border uses the fixed rows 91 and 92 + sub LongTriNtContext + { + # Top-left + $ws->write($rowStart_SBSdistrBySeg, $colStart_matrixSeqContext, "Panel 2. Stranded analysis of trinucleotide sequence context of SBS", $format_topLeft); + # Top + for(my $i=1; $i<=28; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg, $colStart_matrixSeqContext+$i, $format_top); } + # Top-right + $ws->write_blank($rowStart_SBSdistrBySeg, $colStart_matrixSeqContext+29, $format_topRight); + # Right + for(my $i=1; $i<=42; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg+$i, $colStart_matrixSeqContext+29, $format_right); } + # Bottom-right + $ws->write_blank(91, $colStart_matrixSeqContext+29, $format_bottomRight); + # Bottom + for(my $i=13; $i<=28; $i++) { $ws->write_blank(92, $colStart_matrixSeqContext+$i, $format_top); } + # Bottom-left + $ws->write_blank(91, $colStart_matrixSeqContext, $format_bottomLeft); + # Left + $ws->write_blank($rowStart_SBSdistrBySeg+1, $colStart_matrixSeqContext, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_matrixSeqContext, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext, $format_left); 
$ws->write_blank($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext, $format_left); + $ws->write_blank($rowStart_SBSdistrBySeg+22, $colStart_matrixSeqContext, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+23, $colStart_matrixSeqContext, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext, $format_left); + } + } + # Write the header for the six mutation types + # Takes the same six arguments as WriteBoderSection and stores them in package (our) variables; the nested Header* subs read them + # The formats are filled in through references by Format_Header(), Format_HeaderSBSDistrBySegAndFunc(), Format_A11Bold(), Format_A11BoldLeft() and Format_Header12MutType() + sub WriteHeaderSection + { + our ($wb, $ws, $rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg, $nb_func, $colStart_matrixSeqContext) = @_; + + our ($format_CA, $format_CG, $format_CT, $format_TA, $format_TC, $format_TG, $format_TG2, $format_LeftHeader, $format_RightHeader, $format_LeftHeader2); + Format_Header($wb, \$format_CA, \$format_CG, \$format_CT, \$format_TA, \$format_TC, \$format_TG, \$format_TG2, \$format_LeftHeader, \$format_RightHeader, \$format_LeftHeader2); + + our ($format_LeftCA, $format_LeftCG, $format_LeftCT, $format_LeftTA, $format_LeftTC, $format_LeftTG, $format_RightCA, $format_RightCG, $format_RightCT, $format_RightTA, $format_RightTC, $format_RightTG); + Format_HeaderSBSDistrBySegAndFunc($wb, \$format_LeftCA, \$format_LeftCG, \$format_LeftCT, \$format_LeftTA, \$format_LeftTC, \$format_LeftTG, \$format_RightCA, \$format_RightCG, \$format_RightCT, \$format_RightTA, \$format_RightTC, \$format_RightTG); + + our $format_A11Bold = ""; Format_A11Bold($wb, \$format_A11Bold); # Arial 11 bold and center + our $format_A11BoldLeft = ""; Format_A11BoldLeft($wb, \$format_A11BoldLeft); # Arial 11 bold and left + + our ($format_header12CA, $format_header12CG, $format_header12CT, $format_header12TA, $format_header12TC, $format_header12TG); + Format_Header12MutType($wb, \$format_header12CA, \$format_header12CG, \$format_header12CT, \$format_header12TA, \$format_header12TC, \$format_header12TG); + + ## Header for SBS distribution by segment + HeaderMutTypeSBSDistrBySeg(); + 
+ ## Header for strand bias by function + $ws->set_column($colStart_SBSdistrBySeg+5, $colStart_SBSdistrBySeg+5, 11); + + my $row = $rowStart_SBSdistrBySeg+$nb_func+10; my $col = $colStart_SBSdistrBySeg; + $ws->write($row, $col+1, ' ', $format_CA); $ws->write($row, $col+2, "C>A", $format_CA); $ws->write($row, $col+3, ' ', $format_CA); + $ws->write($row, $col+5, ' ', $format_CG); $ws->write($row, $col+6, "C>G", $format_CG); $ws->write($row, $col+7, ' ', $format_CG); + $ws->write($row, $col+9, ' ', $format_CT); $ws->write($row, $col+10, "C>T", $format_CT); $ws->write($row, $col+11, ' ', $format_RightCT); + + $row = $rowStart_SBSdistrBySeg+($nb_func*2)+14; + $ws->write($row, $col+1, ' ', $format_TA); $ws->write($row, $col+2, "T>A", $format_TA); $ws->write($row, $col+3, ' ', $format_TA); + $ws->write($row, $col+5, ' ', $format_TC); $ws->write($row, $col+6, "T>C", $format_TC); $ws->write($row, $col+7, ' ', $format_TC); + $ws->write($row, $col+9, ' ', $format_TG2); $ws->write($row, $col+10, "T>G", $format_TG2); $ws->write($row, $col+11, ' ', $format_RightTG); + + $ws->set_row($rowStart_SBSdistrBySeg+$nb_func+11, 18); $ws->set_row($rowStart_SBSdistrBySeg+($nb_func*2)+15, 18); + $ws->set_column($colStart_SBSdistrBySeg+5, $colStart_SBSdistrBySeg+5, 13); $ws->set_column($colStart_SBSdistrBySeg+9, $colStart_SBSdistrBySeg+9, 13); + + for(my $i=$rowStart_SBSdistrBySeg+$nb_func+10; $i<=$rowStart_SBSdistrBySeg+($nb_func*2)+14; $i+=$nb_func+4) + { + $ws->write($i+1, $colStart_SBSdistrBySeg, 'Segment', $format_LeftHeader); $ws -> write($i+1, $colStart_SBSdistrBySeg+1, 'Non-Tr/Tr', $format_A11Bold); $ws -> write($i+1, $colStart_SBSdistrBySeg+2, 'Non-Tr', $format_A11Bold); $ws -> write($i+1, $colStart_SBSdistrBySeg+3, 'Tr', $format_A11Bold); + $ws -> write($i+1, $colStart_SBSdistrBySeg+5, 'Non-Tr/Tr', $format_A11Bold); $ws -> write($i+1, $colStart_SBSdistrBySeg+6, 'Non-Tr', $format_A11Bold); $ws -> write($i+1, $colStart_SBSdistrBySeg+7, 'Tr', $format_A11Bold); + $ws -> write($i+1, 
$colStart_SBSdistrBySeg+9, 'Non-Tr/Tr', $format_A11Bold); $ws -> write($i+1, $colStart_SBSdistrBySeg+10, 'Non-Tr', $format_A11Bold); $ws -> write($i+1, $colStart_SBSdistrBySeg+11, 'Tr', $format_RightHeader); + } + + + ## Header for Counts of SBS per chromosome and mutation type + HeaderCountSBSPerChr(); + + ## Header for the short sequence context + HeaderShortTriNtContext(); + + ## Header for the 12 mutation types with the sequence context (coding strand) + HeaderLongTriNtContext(); + + sub HeaderMutTypeSBSDistrBySeg + { + $ws->set_row($rowStart_SBSdistrBySeg+2, 18); + $ws->write($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+2, "C:G>A:T", $format_CA); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+3, $format_CA); + $ws->write($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+4, "C:G>G:C", $format_CG); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+5, $format_CG); + $ws->write($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+6, "C:G>T:A", $format_CT); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+7, $format_CT); + $ws->write($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+8, "T:A>A:T", $format_TA); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+9, $format_TA); + $ws->write($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+10, "T:A>C:G", $format_TC); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+11, $format_TC); + $ws->write($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+12, "T:A>G:C", $format_TG); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+13, $format_TG); + + $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg, "Segment", $format_LeftHeader); $ws->set_column($colStart_SBSdistrBySeg, $colStart_SBSdistrBySeg, 13); $ws->set_row($rowStart_SBSdistrBySeg+3, 18); + $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+1, "Total SBS", $format_A11Bold); $ws->set_column($colStart_SBSdistrBySeg+1, 
$colStart_SBSdistrBySeg+1, 11); + $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+2, "%", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+3, "#", $format_A11Bold); + $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+4, "%", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+5, "#", $format_A11Bold); + $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+6, "%", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+7, "#", $format_A11Bold); + $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+8, "%", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+9, "#", $format_A11Bold); + $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+10, "%", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+11, "#", $format_A11Bold); + $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+12, "%", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+3, 13, "#", $format_RightHeader); + } + + sub HeaderCountSBSPerChr + { + $ws->set_column(3,3, 10); $ws->set_column(4,4, 10); + $ws->set_row($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, 18); + $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+7, $colStart_SBSdistrBySeg+1, "Pearson", $format_A11Bold); + $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg, "Chr", $format_LeftHeader); + $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+1, "Size", $format_A11Bold); + $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+2, "All SBS", $format_A11Bold); + + $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+3, "C:G>A:T", $format_CA); + $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+4, "C:G>G:C", $format_CG); + 
$ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+5, "C:G>T:A", $format_CT); + $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+6, "T:A>A:T", $format_TA); + $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+7, "T:A>C:G", $format_TC); + $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+8, "T:A>G:C", $format_TG); + } + + sub HeaderShortTriNtContext + { + ### GENOMIC STRAND + $ws->write(2, $colStart_matrixSeqContext, 'Count matrix', $format_LeftHeader2); + $ws->write(3, $colStart_matrixSeqContext+4, 'C>A', $format_CA); $ws->write(3, $colStart_matrixSeqContext+5, 'C>G', $format_CG); $ws->write(3, $colStart_matrixSeqContext+6, 'C>T', $format_CT); $ws->write(3, $colStart_matrixSeqContext+7, 'T>A', $format_TA); $ws->write(3, $colStart_matrixSeqContext+8, 'T>C', $format_TC); $ws->write(3, $colStart_matrixSeqContext+9, 'T>G', $format_TG2); + + $ws->write(2, $colStart_matrixSeqContext+11, 'Frequency matrix', $format_A11BoldLeft); + $ws->write(3, $colStart_matrixSeqContext+14, 'C>A', $format_CA); $ws->write(3, $colStart_matrixSeqContext+15, 'C>G', $format_CG); $ws->write(3, $colStart_matrixSeqContext+16, 'C>T', $format_CT); $ws->write(3, $colStart_matrixSeqContext+17, 'T>A', $format_TA); $ws->write(3, $colStart_matrixSeqContext+18, 'T>C', $format_TC); $ws->write(3, $colStart_matrixSeqContext+19, 'T>G', $format_TG2); + + ### sequence context with a bar graph + $ws->write(25, $colStart_matrixSeqContext+10, "Mutation spectra frequency", $format_A11Bold); + } + + sub HeaderLongTriNtContext + { + $ws->set_row($rowStart_SBSdistrBySeg+3, 15); $ws->set_row($rowStart_SBSdistrBySeg+4, 15); $ws->set_row($rowStart_SBSdistrBySeg+5, 15); + $ws->write($rowStart_SBSdistrBySeg+3, $colStart_matrixSeqContext, "Count matrix", $format_LeftHeader2); + $ws->write($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+1, "C>A", $format_CA); 
$ws->write_blank($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+2, $format_CA); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+1, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+2, "Tr", $format_A11Bold); + $ws->write($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+3, "C>G", $format_CG); $ws->write_blank($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+4, $format_CG); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+3, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+4, "Tr", $format_A11Bold); + $ws->write($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+5, "C>T", $format_CT); $ws->write_blank($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+6, $format_CT); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+5, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+6, "Tr", $format_A11Bold); + $ws->write($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+7, "T>A", $format_TA); $ws->write_blank($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+8, $format_TA); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+7, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+8, "Tr", $format_A11Bold); + $ws->write($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+9, "T>C", $format_TC); $ws->write_blank($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+10, $format_TC); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+9, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+10, "Tr", $format_A11Bold); + $ws->write($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+11, "T>G", $format_TG2); $ws->write_blank($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+12, $format_TG2); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+11, "NonTr", 
$format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+12, "Tr", $format_A11Bold); + + + $ws->set_row($rowStart_SBSdistrBySeg+24, 15); $ws->set_row($rowStart_SBSdistrBySeg+25, 15); $ws->set_row($rowStart_SBSdistrBySeg+26, 15); + $ws->write($rowStart_SBSdistrBySeg+24, $colStart_matrixSeqContext, "Frequency matrix", $format_LeftHeader2); + $ws->write($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+1, "C>A", $format_CA); $ws->write_blank($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+2, $format_CA); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+1, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+2, "Tr", $format_A11Bold); + $ws->write($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+3, "C>G", $format_CG); $ws->write_blank($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+4, $format_CG); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+3, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+4, "Tr", $format_A11Bold); + $ws->write($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+5, "C>T", $format_CT); $ws->write_blank($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+6, $format_CT); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+5, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+6, "Tr", $format_A11Bold); + $ws->write($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+7, "T>A", $format_TA); $ws->write_blank($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+8, $format_TA); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+7, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+8, "Tr", $format_A11Bold); + $ws->write($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+9, "T>C", $format_TC); $ws->write_blank($rowStart_SBSdistrBySeg+25, 
$colStart_matrixSeqContext+10, $format_TC); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+9, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+10, "Tr", $format_A11Bold); + $ws->write($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+11, "T>G", $format_TG2); $ws->write_blank($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+12, $format_TG2); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+11, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+12, "Tr", $format_A11Bold); + } + } + # Create logo for representing the sequence context with n bases + # $refH_file : hash reference read as {sample}{'WebLogo3'}{mutation type} => array reference of sequences + # $folderWebLogo: folder in which one sub-folder per sample is created + # One fasta file is written per sample and mutation type, then weblogo is run on every fasta file of the sample folder + sub CreateLogo + { + my ($refH_file, $folderWebLogo) = @_; + + my $folderSample = ""; + + # Dereference explicitly: calling keys on a hash reference was an experimental feature that was removed in perl 5.24 + foreach my $k_file (sort keys %{$refH_file}) + { + $folderSample = "$folderWebLogo/$k_file"; + if(!-e $folderSample) { mkdir($folderSample) or die "Can't create the directory $folderSample\n"; } + + # Flag set from the length of the last sequence written: 1 when it is at least 10 bases long + my $test_lengthSeqContext = 0; + + foreach my $k_mutation (sort keys %{$refH_file->{$k_file}{'WebLogo3'}}) + { + open(WEBLOGO, ">", "$folderSample/$k_file-$k_mutation.fa") or die "$!: $folderSample/$k_file-$k_mutation.fa\n"; + foreach (@{$refH_file->{$k_file}{'WebLogo3'}{$k_mutation}}) + { + print WEBLOGO ">$k_file\n$_\n"; + + if(length($_) < 10) { $test_lengthSeqContext = 0; } + else { $test_lengthSeqContext = 1; } + } + close WEBLOGO; + } + + ## Generate the logo + foreach my $fastaFile (`ls $folderSample/*.fa`) + { + chomp($fastaFile); + my ($filename, $directories, $suffix) = fileparse("$folderSample/$fastaFile", qr/\.[^.]*/); + + # The title is the part of the file name located before the last dash + $filename =~ /(.+)\-/; + my $title = $1; + + ## Test if there is fasta sequence for the mutation type + my $nbLigne_temp = `wc -l $fastaFile`; + my @nbLigne = split(" ", $nbLigne_temp); + + if($nbLigne[0] == 0) { print "WARNING: No sequence for $filename\n"; next; } + + # When length sequence context is lower than 10 the image is too small for adding a title + if($test_lengthSeqContext 
== 1) { system("weblogo -c classic -F png -U probability --title $title < $fastaFile > $folderSample/$filename-Probability.png"); } + else { system("weblogo -c classic -F png -U probability < $fastaFile > $folderSample/$filename-Probability.png"); } + } + } + } + + + # Define the format of the worksheet: Arial font size=10 + sub Format_A10 + { + my ($wb, $format) = @_; + $$format = $wb->add_format(font=>'Arial', size=>10); $$format->set_align('center'); + } + # Define the format of the worksheet: Arial font size=11 bold and center + sub Format_A11Bold + { + my ($wb, $format) = @_; + $$format = $wb->add_format(font=>'Arial', size=>11, bold=>1); $$format->set_align('center'); + } + # Define the format of the worksheet: Arial font size=10 italic red and center + sub Format_A10ItalicRed + { + my ($wb, $format) = @_; + $$format = $wb->add_format(font=>'Arial', size=>10, italic=>1, color => 'red'); $$format->set_align('center'); + } + # Defile the format of the worksheet: Arialt font size=11 bold and left + sub Format_A11BoldLeft + { + my ($wb, $format) = @_; + $$format = $wb->add_format(valign =>'left', font=>'Arial', size=>11, bold=>1); + } + # Defile the format of the worksheet: Arialt font size=10 bold and left + sub Format_A10BoldLeft + { + my ($wb, $format) = @_; + $$format = $wb->add_format(valign =>'left', font=>'Arial', size=>10, bold=>1); + } + # Define the format of the border of the section (for delimiting the different section of the report) + sub Format_section + { + my ($wb, $format_topLeft, $format_topRight, $format_bottomLeft, $format_bottomRight, $format_top, $format_right, $format_bottom, $format_left) = @_; + + $$format_topLeft = $wb->add_format(valign => 'left', bold => 1, font => 'Arial', size => 12); + $$format_topLeft->set_top(2); $$format_topLeft->set_top_color('blue'); + $$format_topLeft->set_left(2); $$format_topLeft->set_left_color('blue'); + + $$format_topRight = $wb->add_format(valign => 'left', bold => 1, font => 'Arial', size => 12); + 
$$format_topRight->set_top(2); $$format_topRight->set_top_color('blue'); + $$format_topRight->set_right(2); $$format_topRight->set_right_color('blue'); + + $$format_bottomLeft = $wb->add_format(valign => 'left', bold => 1, font => 'Arial', size => 12); + $$format_bottomLeft->set_bottom(2); $$format_bottomLeft->set_bottom_color('blue'); + $$format_bottomLeft->set_left(2); $$format_bottomLeft->set_left_color('blue'); + + $$format_bottomRight = $wb->add_format(valign => 'left', bold => 1, font => 'Arial', size => 12); + $$format_bottomRight->set_bottom(2); $$format_bottomRight->set_bottom_color('blue'); + $$format_bottomRight->set_right(2); $$format_bottomRight->set_right_color('blue'); + + $$format_top = $wb->add_format(); $$format_top->set_top(2); $$format_top->set_top_color('blue'); + $$format_right = $wb->add_format(); $$format_right->set_right(2); $$format_right->set_right_color('blue'); + $$format_bottom = $wb->add_format(); $$format_bottom->set_bottom(2); $$format_bottom->set_bottom_color('blue'); + $$format_left = $wb->add_format(); $$format_left->set_left(2); $$format_left->set_left_color('blue'); + } + # Define the header + sub Format_Header + { + my ($wb, $format_CA, $format_CG, $format_CT, $format_TA, $format_TC, $format_TG, $format_TG2, $format_LeftHeader, $format_RightHeader, $format_LeftHeader2) = @_; + + my ($blue, $black, $red, $gray, $green, $pink); + Color($wb, \$blue, \$black, \$red, \$gray, \$green, \$pink); + + my ($bgColor_blue, $bgColor_black, $bgColor_red, $bgColor_gray, $bgColor_green, $bgColor_pink); + BackgroundColor($wb, \$bgColor_blue, \$bgColor_black, \$bgColor_red, \$bgColor_gray, \$bgColor_green, \$bgColor_pink); + + + $$format_CA = $wb->add_format(bg_color => $blue, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_CA->set_align('center'); $$format_CA->set_center_across(); + $$format_CG = $wb->add_format(bg_color => $black, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_CG->set_align('center'); 
$$format_CG->set_center_across(); + $$format_CT = $wb->add_format(bg_color => $red, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_CT->set_align('center'); $$format_CT->set_center_across(); + $$format_TA = $wb->add_format(bg_color => $gray, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_TA->set_align('center'); $$format_TA->set_center_across(); + $$format_TC = $wb->add_format(bg_color => $green, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_TC->set_align('center'); $$format_TC->set_center_across(); + $$format_TG = $wb->add_format(bg_color=>$bgColor_pink, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_TG->set_align('center'); $$format_TG->set_center_across(); + $$format_TG->set_right(2); $$format_TG->set_right_color('blue'); + + $$format_TG2 = $wb->add_format(bg_color => $pink, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_TG2->set_align('center'); $$format_TG2->set_center_across(); + + $$format_LeftHeader = $wb->add_format(bold=>1, font=>'Arial', size=>11); $$format_LeftHeader->set_align('center'); $$format_LeftHeader->set_left(2); $$format_LeftHeader->set_left_color('blue'); + $$format_LeftHeader2 = $wb->add_format(bold=>1, font=>'Arial', size=>11); $$format_LeftHeader2->set_left(2); $$format_LeftHeader2->set_left_color('blue'); + $$format_RightHeader = $wb->add_format(bold=>1, font=>'Arial', size=>11); $$format_RightHeader->set_align('center'); $$format_RightHeader->set_right(2); $$format_RightHeader->set_right_color('blue'); + } + # Define the mutation type header for the Strand bias by segment + sub Format_HeaderSBSDistrBySegAndFunc + { + my ($wb, $format_LeftCA, $format_LeftCG, $format_LeftCT, $format_LeftTA, $format_LeftTC, $format_LeftTG, $format_RightCA, $format_RightCG, $format_RightCT, $format_RightTA, $format_RightTC, $format_RightTG) = @_; + + my ($bgColor_blue, $bgColor_black, $bgColor_red, $bgColor_gray, $bgColor_green, $bgColor_pink); + BackgroundColor($wb, \$bgColor_blue, 
\$bgColor_black, \$bgColor_red, \$bgColor_gray, \$bgColor_green, \$bgColor_pink); + + $$format_LeftCA = $wb->add_format(bg_color=>$bgColor_blue, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_LeftCA->set_align('center'); $$format_LeftCA->set_left(2); $$format_LeftCA->set_left_color('blue'); + $$format_LeftCG = $wb->add_format(bg_color=>$bgColor_black, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_LeftCG->set_align('center'); $$format_LeftCG->set_left(2); $$format_LeftCG->set_left_color('blue'); + $$format_LeftCT = $wb->add_format(bg_color=>$bgColor_red, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_LeftCT->set_align('center'); $$format_LeftCT->set_left(2); $$format_LeftCT->set_left_color('blue'); + $$format_LeftTA = $wb->add_format(bg_color=>$bgColor_gray, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_LeftTA->set_align('center'); $$format_LeftTA->set_left(2); $$format_LeftTA->set_left_color('blue'); + $$format_LeftTC = $wb->add_format(bg_color=>$bgColor_green, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_LeftTC->set_align('center'); $$format_LeftTC->set_left(2); $$format_LeftTC->set_left_color('blue'); + $$format_LeftTG = $wb->add_format(bg_color=>$bgColor_pink, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_LeftTG->set_align('center'); $$format_LeftTG->set_left(2); $$format_LeftTG->set_left_color('blue'); + + + $$format_RightCA = $wb->add_format(bg_color=>$bgColor_blue, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_RightCA->set_align('center'); $$format_RightCA->set_right(2); $$format_RightCA->set_right_color('blue'); + $$format_RightCG = $wb->add_format(bg_color=>$bgColor_black, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_RightCG->set_align('center'); $$format_RightCG->set_right(2); $$format_RightCG->set_right_color('blue'); + $$format_RightCT = $wb->add_format(bg_color=>$bgColor_red, font=>'Arial', bold=>1, size=>11, color=>'white'); 
$$format_RightCT->set_align('center'); $$format_RightCT->set_right(2); $$format_RightCT->set_right_color('blue'); + $$format_RightTA = $wb->add_format(bg_color=>$bgColor_gray, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_RightTA->set_align('center'); $$format_RightTA->set_right(2); $$format_RightTA->set_right_color('blue'); + $$format_RightTC = $wb->add_format(bg_color=>$bgColor_green, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_RightTC->set_align('center'); $$format_RightTC->set_right(2); $$format_RightTC->set_right_color('blue'); + $$format_RightTG = $wb->add_format(bg_color=>$bgColor_pink, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_RightTG->set_align('center'); $$format_RightTG->set_right(2); $$format_RightTG->set_right_color('blue'); + } + # Define the mutation type header for the trinucleotide sequence context on the coding strand + sub Format_Header12MutType + { + my ($wb, $format_CA, $format_CG, $format_CT, $format_TA, $format_TC, $format_TG) = @_; + + my ($bgColor_blue, $bgColor_black, $bgColor_red, $bgColor_gray, $bgColor_green, $bgColor_pink); + BackgroundColor($wb, \$bgColor_blue, \$bgColor_black, \$bgColor_red, \$bgColor_gray, \$bgColor_green, \$bgColor_pink); + + $$format_CA = $wb->add_format(bg_color=>$bgColor_blue, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_CA->set_align('center'); + $$format_CG = $wb->add_format(bg_color=>$bgColor_black, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_CG->set_align('center'); + $$format_CT = $wb->add_format(bg_color=>$bgColor_red, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_CT->set_align('center'); + $$format_TA = $wb->add_format(bg_color=>$bgColor_gray, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_TA->set_align('center'); + $$format_TC = $wb->add_format(bg_color=>$bgColor_green, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_TC->set_align('center'); + $$format_TG = 
$wb->add_format(bg_color=>$bgColor_pink, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_TG->set_align('center'); + } + # Define the format for the text that needs a section border + sub Format_TextSection + { + my ($wb, $formatT_left, $formatT_right, $formatT_bottomRight, $formatT_bottomLeft, $formatT_bottom, $formatT_bottomHeader, $formatT_bottomRightHeader, $formatT_bottomHeader2, $formatT_rightHeader) = @_; + + $$formatT_left = $wb->add_format(valign=>'center', font=>'Arial', size=>10); + $$formatT_left->set_left(2); $$formatT_left->set_left_color('blue'); + + $$formatT_right = $wb->add_format(valign=>'center', font=>'Arial', size=>10); + $$formatT_right->set_right(2); $$formatT_right->set_right_color('blue'); + + $$formatT_bottomRight = $wb->add_format(valign=>'center', font=>'Arial', size=>10); + $$formatT_bottomRight->set_bottom(2); $$formatT_bottomRight->set_bottom_color('blue'); + $$formatT_bottomRight->set_right(2); $$formatT_bottomRight->set_right_color('blue'); + + $$formatT_bottomLeft = $wb->add_format(valign=>'center', font=>'Arial', size=>10); + $$formatT_bottomLeft->set_bottom(2); $$formatT_bottomLeft->set_bottom_color('blue'); + $$formatT_bottomLeft->set_left(2); $$formatT_bottomLeft->set_left_color('blue'); + + $$formatT_bottom = $wb->add_format(valign=>'center', font=>'Arial', size=>10); + $$formatT_bottom->set_bottom(2); $$formatT_bottom->set_bottom_color('blue'); + + my $bgColor_totallighGray = $wb->set_custom_color(54, 230, 230, 230); + $$formatT_bottomHeader = $wb->add_format(bg_color=>$bgColor_totallighGray, font=>'Arial', bold=>1, size=>11); $$formatT_bottomHeader->set_align('center'); + $$formatT_bottomHeader->set_bottom(2); $$formatT_bottomHeader->set_bottom_color('blue'); + + $$formatT_bottomRightHeader = $wb->add_format(bg_color=>$bgColor_totallighGray, font=>'Arial', bold=>1, size=>11); $$formatT_bottomRightHeader->set_align('center'); + $$formatT_bottomRightHeader->set_bottom(2); 
$$formatT_bottomRightHeader->set_bottom_color('blue'); + $$formatT_bottomRightHeader->set_right(2); $$formatT_bottomRightHeader->set_right_color('blue'); + + $$formatT_bottomHeader2 = $wb->add_format(bg_color=>$bgColor_totallighGray, font=>'Arial', bold=>1, size=>11); $$formatT_bottomHeader2->set_align('center'); + + $$formatT_rightHeader = $wb->add_format(bg_color=>$bgColor_totallighGray, font=>'Arial', bold=>1, size=>11); $$formatT_rightHeader->set_align('center'); + $$formatT_rightHeader->set_right(2); $$formatT_rightHeader->set_right_color('blue'); + } + # Define the format for the graphs titles + sub Format_GraphTitle + { + my ($wb, $formatT_graphTitle) = @_; + + $$formatT_graphTitle = $wb->add_format(font=>'Arial', size=>12, bold=>1); + } + # Define the format of the border of the tables + sub Format_Table + { + my ($wb, $table_topleft, $table_topRight, $table_bottomleft, $table_bottomRight, $table_top, $table_right, $table_bottom, $table_bottomItalicRed, $table_left, $table_bottomrightHeader, $table_left2, $table_middleHeader, $table_middleHeader2) = @_; + + $$table_topleft = $wb->add_format(valign=>'center', bold=>1, font=>'Arial', size=>10); $$table_topleft->set_top(1); $$table_topleft->set_left(1); + $$table_topRight = $wb->add_format(valign=>'center', bold=>1, font=>'Arial', size=>10); $$table_topRight->set_top(1); $$table_topRight->set_right(1); + $$table_bottomleft = $wb->add_format(valign=>'center', bold=>1, font=>'Arial', size=>10); $$table_bottomleft->set_bottom(1); $$table_bottomleft->set_left(1); + $$table_bottomRight = $wb->add_format(valign=>'center', font=>'Arial', size=>10); $$table_bottomRight->set_bottom(1); $$table_bottomRight->set_right(1); + + $$table_top = $wb->add_format(valign=>'center', bold=>1, font=>'Arial', size=>10); $$table_top->set_top(1); + $$table_right = $wb->add_format(valign=>'center', font=>'Arial', size=>10); $$table_right->set_right(1); + $$table_bottom = $wb->add_format(valign=>'center', font=>'Arial', size=>10); 
$$table_bottom->set_bottom(1); + $$table_bottomItalicRed = $wb->add_format(valign=>'center', font=>'Arial', size=>10, italic=>1, color => 'red'); $$table_bottomItalicRed->set_bottom(1); + $$table_left = $wb->add_format(valign=>'center', bold=>1, font=>'Arial', size=>10); $$table_left->set_left(1); + + my $bgColor_totallighGray = $wb->set_custom_color(54, 230, 230, 230); + $$table_bottomrightHeader = $wb->add_format(bg_color=>$bgColor_totallighGray, font=>'Arial', bold=>1, size=>10); $$table_bottomrightHeader->set_bottom(1); $$table_bottomrightHeader->set_right(1); + + $$table_left2 = $wb->add_format(valign=>'left', font=>'Arial', size=>10); $$table_left2->set_left(1); + + $$table_middleHeader = $wb->add_format(valign=>'center', bg_color=>$bgColor_totallighGray, font=>'Arial', bold=>1, size=>10); + $$table_middleHeader2 = $wb->add_format(valign=>'center', bg_color=>$bgColor_totallighGray, font=>'Arial', bold=>1, size=>10); $$table_middleHeader2->set_bottom(1); + } + + # Define the color + sub Color + { + my ($wb, $blue, $black, $red, $gray, $green, $pink) = @_; + + $$blue = $wb->set_custom_color(40, 0, 0, 204);# C:G>A:T in blue + $$black = $wb->set_custom_color(41, 0, 0, 0);# C:G>G:C in black + $$red = $wb->set_custom_color(42, 255, 0, 0);# C:G>T:A in red + $$gray = $wb->set_custom_color(43, 205, 205, 205); # T:A>A:T in light gray + $$green = $wb->set_custom_color(44, 0, 204, 51);# T:A>C:G in green + $$pink = $wb->set_custom_color(45, 255, 192, 203);# T:A>G:C in pink + } + sub BackgroundColor + { + my ($wb, $bgColor_blue, $bgColor_black, $bgColor_red, $bgColor_gray, $bgColor_green, $bgColor_pink) = @_; + + $$bgColor_blue = $wb->set_custom_color(48, 0, 0, 204); + $$bgColor_black = $wb->set_custom_color(49, 0, 0, 0); + $$bgColor_red = $wb->set_custom_color(50, 255, 0, 0); + $$bgColor_gray = $wb->set_custom_color(51, 205, 205, 205); + $$bgColor_green = $wb->set_custom_color(52, 0, 204, 51); + $$bgColor_pink = $wb->set_custom_color(53, 255, 192, 203); + } +} + + +sub 
recoverNumCol +{ + # Return the 0-based index of the column named $name_of_column in the tab-separated header (first line) of the file $input. + # When the column is absent an error is printed on STDERR and the program exits with a non-zero status. + my ($input, $name_of_column) = @_; + + # Lexical filehandle and 3-argument open: the file name can never be interpreted as an open mode + open(my $fh, "<", $input) or die "recoverNumCol: $!: $input\n"; + # For having the name of the columns + my $search_header = <$fh>; + close $fh; + # An empty file has no header line: use an empty header so that the column is reported as missing + $search_header = "" unless defined $search_header; + $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header); + # The number of the column ("toto" is the not-found marker) + my $name_of_column_NB = "toto"; + for(my $i=0; $i<=$#tab_search_header; $i++) + { + if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; last; } + } + # Exit with a failure status so that the caller can detect the error + if($name_of_column_NB eq "toto") { print STDERR "Error recoverNumCol(): the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit 1; } + else { return $name_of_column_NB; } +} + + + + +=head1 NAME + +mutSpec-Stat + +=head1 SYNOPSIS + + mutSpecstat.pl [arguments] <query-file> + + <query-file> can be a folder with multiple VCF or a single VCF + + Arguments: + -h, --help print help message + -m, --man print complete documentation + -v, --verbose use verbose output + --refGenome the reference genome to use (human, mouse or rat genomes) + -o, --outfile <string> output directory for the result. If none is specified, the result will be written in the same directory as the input file + -temp --pathTemporary <string> the path for saving the temporary files + --pathSeqRefGenome the path to the fasta reference sequences + --poolData generate the pool of all the samples (optional) + --reportSample generate a report for each sample (optional) + + +Function: automatically run a pipeline and calculate various statistics on mutations + + Example: mutSpecstat.pl --refGenome hg19 --outfile output_directory --temp path_to_temporary_directory --pathRscript path_to_R_scripts --pathSeqRefGenome path_fasta_ref_seq --poolData --reportSample input + + Version: 04-2016 (April 2016) + + +=head1 OPTIONS + +=over 8 + +=item B<--help> + +print a brief usage message and detailed explanation of options. + +=item B<--man> + +print the complete manual of the program. 
+ +=item B<--verbose> + +use verbose output. + +=item B<--refGenome> + +the reference genome to use, could be human, mouse or rat genomes. + +=item B<--outfile> + +the directory of the output files. If it is not specified, the same directory as the input file is used. + +=item B<--pathTemporary> + +the path for saving temporary files generated by the script. +If none is specified, a temporary folder is created in the directory where the script is running. +It is deleted when the script finishes. + +=item B<--pathSeqRefGenome> + +The path to the fasta reference sequences + +=item B<--poolData only for the report> + +calculate the statistics on the pool of all the data passed as input + +=item B<--reportSample only for the report> + +generate a report for each sample + +=back + +=head1 DESCRIPTION + +mutSpecstat is a perl script for calculating various statistics on mutations. +An Excel report containing the mutation type distribution per functional region, the strand bias and the sequence context on genomic and coding sequence is created. +The different statistics are illustrated using ggplot2. + +=cut
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutspecStat.xml Tue Apr 19 03:07:11 2016 -0400 @@ -0,0 +1,157 @@ +<tool id="mutSpecStat" name="MutSpec Stat" version="0.1" hidden="false"> +<description>Calculate various statistics on mutations</description> + +<requirements> + <requirement type="set_environment">SCRIPT_PATH</requirement> + <requirement type="package" version="5.18.1">perl</requirement> + <requirement type="package" version="3.3">weblogo</requirement> + <requirement type="package" version="1.7.1">numpy</requirement> + <requirement type="package" version="3.1.2">R</requirement> + <requirement type="package" version="0.1">mutspec</requirement> +</requirements> + +<command interpreter="bash"> + mutspecStat_wrapper.sh + $html + ${GALAXY_DATA_INDEX_DIR}/shared/ucsc/chrom/ + #if $estimateSignature.estimSign == "true": + ${estimateSignature.estimT} + #else + 0 + #end if + + "--refGenome ${refGenome} --pathSeqRefGenome ${refGenome.fields.path} $pooldata $reportSample" + #import re + #for $f in $dataset_list + #set $regexp = $re.compile("\((.*)\)") + #if $regexp.search($f.name) + #set filename=$regexp.search($f.name) + "$f=${filename.group(1)}" + #else + "$f=${f.name}" + #end if + #end for +</command> + +<inputs> + <param name="dataset_list" type="data_collection" format="tabular" collection_type="list" label="Annotated Dataset List" help="Select a dataset list/collection from your history" /> + <param name="refGenome" type="select" label="Reference genome" help="All data in your dataset list should have been generated with the selected genome"> + <options from_data_table="annovar_index" /> + </param> + + <param name="pooldata" type="boolean" checked="true" truevalue="--pooldata" falsevalue="" label="Include statistics on the pooled samples" /> + <param name="reportSample" type="boolean" checked="false" truevalue="--reportSample" falsevalue="" label="Generate one output file for each sample" help="By default, one output Excel file will be generated with 
statistics of each sample shown in different data sheets. Setting this option to true will generate one Excel file for each sample instead. It is recommended to use this option if your dataset list contains more than 250 files as the Excel output file may be too heavy to open easily on a computer with limited RAM"/> + + <!-- Optional estimation of the number of signatures: estimT is passed to the wrapper when estimSign is checked, 0 otherwise (see the command section) --> + <conditional name="estimateSignature"> + <param name="estimSign" type="boolean" label="Compute statistics for estimating the number of signatures" help="This option generates different statistics that can be used to estimate the number of signatures to extract with NMF (this number should be used in the MutSpec-NMF tool)"/> + <when value="true"> + <!-- integer: the wrapper script compares this value numerically, so let Galaxy validate it --> + <param name="estimT" type="integer" value="8" label="Maximum number of signatures to compute" help="Warning: Selecting a number above 8 may not work on small datasets"/> + </when> + <!-- explicit empty branch for the unchecked state --> + <when value="false" /> + </conditional> + +</inputs> + +<outputs> + <data name="html" type="data" format="html" label="mutation spectra report on ${dataset_list.name}" /> +</outputs> + +<stdio> + <regex match="FutureWarning" + source="both" + level="warning" + description="FutureWarning" /> +</stdio> + +<help> + +**What it does** + +MutSpec-Stat calculates various statistics describing mutation characteristics extracted from a dataset collection, and optionally estimates the number of signatures present in the dataset. +The statistics include overall distribution of mutations, mutation distribution for single base substitutions (SBS) by functional regions, chromosomes, or in their trinucleotide sequence context (see details below). + +-------------------------------------------------------------------------------------------------------------------------------------------------- + +**Input formats** + +The tool accepts a dataset list + +.. class:: infomark + +You should thus create a dataset list even when using one file (see Galaxy help to learn `how to create a dataset list`__) + +.. __: https://wiki.galaxyproject.org/Histories#Dataset_Collections + +.. 
class:: warningmark + +The input files must have been generated by the MutSpec-Annot tool (so they contain the required annotations). + +-------------------------------------------------------------------------------------------------------------------------------------------------- + +**Output** + +MutSpec-Stat generates an html page with links to : + - an Excel file that includes all computed statistics shown in tabular and graphical formats, for each sample (one by datasheet) and for the pooled samples (optional), + - html pages for individual sample results, + - the input matrix for the tool MutSpec-NMF, + - the result of the estimation of the number of signatures (if the option "Compute statistics for estimating the number of signatures" was selected). + +The following statistics are generated: + +**Graph 1. SBS distribution** +Proportion (percent of all SBS) of each type of single base substitution (SBS). +All SBS are considered, including the ones without strand orientation annotation. + +**Table 1. Frequency and counts of all SBS** +Values corresponding to graph 1. + + +**Graph 2. Impact on protein sequence** +Impact of all mutations (SBS and Indel) on the protein sequence based on the ExonicFunc.refGene annotation. +For more details about the annotation, please visit the `Annovar web page`__ + +.. __: http://www.openbioinformatics.org/annovar/annovar_gene.html#output1 + + +**Table 2. Frequency and counts of functional impacts** +Values corresponding to graph 2. + + +**Graph 3. Stranded distribution of SBS** +Proportion (percent of all SBS with strand annotation) of the six substitution types on the transcribed and non-transcribed strand. +Only regions with strand annotation are considered. + +**Table 3. Significance of the strand biases** +The strand bias for each SBS type is calculated as the ratio of SBS on the non-transcribed (coding) versus the transcribed (non-coding) strand. 
+The statistical significance of the difference between the mutational frequencies on the non-transcribed and the +transcribed strand (expected to be equal, i.e. 0.5 each, by chance) is assessed using a chi-squared test followed by the +Benjamini-Hochberg procedure for multiple testing correction (only samples with at least 1 mutation on the non-transcribed or on the transcribed strand are considered). +Two tables are shown to display the 6 SBS types in both orientations. + + +**Table 4. SBS distribution by functional region** +Counts and percentages of SBS in genomic regions based on the Func.refGene annotation. + + +**Table 5. Strand bias by functional region** +Counts of the strand bias for the 6 SBS types in different functional regions. + + +**Table 6. SBS distribution per chromosome** +Counts of SBS per chromosome for the six SBS types. +The correlation between SBS counts and chromosome size is calculated using a Pearson correlation test. + + +**Panel 1. Trinucleotide sequence context of SBS on the genomic sequence** +The trinucleotide sequence context takes into consideration the flanking bases in 5' and in 3' of the SBS. +SBS counts and frequency data are shown as tables, heatmaps or bar graphs. The heatmap colors are scaled to the maximum value of the corresponding table. The bar graph is scaled to the maximum frequency value (the total number of mutations per SBS type is shown in parentheses). + + + +**Panel 2. Stranded analysis of trinucleotide sequence context of SBS** +SBS within their trinucleotide sequence context are counted on the non-transcribed and transcribed strands of the gene region they are located in. Counts and frequencies are shown as tables or bar graphs. +Only SBS with strand orientation annotation are considered in this analysis (strand annotation retrieved from RefSeq database). + + +</help> + +</tool>
#!/bin/bash
#
# mutspecStat_wrapper.sh
#
# Galaxy wrapper around mutspecStat.pl. It links the input datasets under
# ./in with readable names, runs the statistics, optionally runs the
# "estimate the number of signatures" R script, and then writes the HTML
# report pages (summary page, one page per sample, optional Pool_Data page).
#
# Positional arguments:
#   $1      html           path of the HTML summary page to append to
#   $2      len_file_path  accepted for interface compatibility, not used below
#   $3      estimSign      when greater than 0, passed as --stop to estimateSign_Galaxy.r
#   $4      parameters     extra options forwarded verbatim to mutspecStat.pl
#   $5...   inputs         "path=label" tokens, one per input dataset
#
# Environment:
#   SCRIPT_PATH            directory holding mutspecStat.pl and the R/ scripts

#########################################
###    SPECIFY THE NUMBER OF CPU      ###
#########################################
cpu=1


#########################################
###      Recover the arguments        ###
#########################################
html=$1; shift
len_file_path=$1; shift
estimSign=$1; shift
parameters=$1; shift
working_dir=$(pwd)


#########################################
###   Link the inputs under ./in      ###
#########################################
mkdir -p in
cd in || exit 1

# The remaining arguments are "path=label" tokens whose labels may contain
# whitespace. Join everything, turn whitespace into "_", then re-split in
# front of every "/" (the start of the next path). The unquoted expansions in
# this section are intentional: the logic depends on word splitting.
names=$(sed 's/\s/_/g' <<< $*)
names=$(sed 's/_\// \//g' <<< $names)
names=$(sed 's/_annotated//g' <<< $names)
names=$(sed 's/_filtered//g' <<< $names)
names=$(sed 's/\.txt_/_/' <<< $names)

for name in ${names}
do
	# "path=label" -> "path label"; the two words become the arguments of ln -s
	file=$(sed 's/=/ /' <<< $name);
	echo $file
	ln -s $file
done
cd ..

# <html path without extension>_files. Only the file name is stripped at its
# first dot, so a dot inside a directory name no longer truncates the path.
html_base=${html##*/}
output_dir="${html%"$html_base"}${html_base%%.*}_files"


#########################################
###     Calculates the statistics     ###
#########################################
# $parameters is deliberately unquoted: it carries several options.
perl "$SCRIPT_PATH/mutspecStat.pl" --outfile "$output_dir" \
	--temp "$working_dir/temp" \
	--pathRscript "$SCRIPT_PATH" \
	$parameters \
	"$working_dir/in"


#########################################
### Estimate the number of signatures ###
#########################################
# Numeric test (-gt); ">" inside [[ ]] would compare the strings lexicographically.
if [[ $estimSign -gt 0 ]]; then
	Rscript "$SCRIPT_PATH/R/estimateSign_Galaxy.r" \
		--input "$output_dir/Mutational_Analysis/Figures/Input_NMF/Input_NMF_Count.txt" \
		--stop "$estimSign" \
		--cpu "$cpu" \
		--output "$output_dir/Mutational_Analysis/Figures/Estimate_Number_Signatures.png" 2>&1
fi


#########################################
###            Create css             ###
#########################################
# Style sheet used by the sample pages to show a legend on mouse-over.
css="$output_dir/Mutational_Analysis/style.css"
if [ -d "$output_dir/Mutational_Analysis" ]; then
	echo ".legend{position:relative}.legend .legend-hidden{display:none;position:absolute;background-color:#fff;border:3px solid #03F;padding:3px;color:#000;font-size:1em;border-radius:10px;margin-top:-40px}.legend:hover .legend-hidden{display:block}" > "$css"
fi


#########################################
###   Helpers for the HTML report     ###
#########################################

# Print the label part of a "path=label" token without its last extension.
sample_label() {
	local label
	label=$(echo "$1" | cut -d"=" -f2)
	echo "${label%.*}"
}

# Print a table cell holding a centered download link.
#   $1: href   $2: link text
link_cell() {
	echo "<td> <center> <a href=\"$1\">$2</a> </center> </td>"
}

# Print a table cell holding a figure and the legend displayed on mouse-over.
#   $1: image path   $2: width in pixels   $3: legend title   $4: legend text
figure_cell() {
	cat <<EOF
<td>
<span class="legend"><img src="$1" width="$2"/>
<span class="legend-hidden">
<center><b>$3</b></center><br/>$4<br/>
</span></span>
</td>
EOF
}

# Print the WebLogo cell of one substitution type, or a warning when the
# logo image was not produced.
#   $1: sample name   $2: two-letter substitution code (CA, CG, CT, TA, TC, TG)
weblogo_cell() {
	local name=$1 code=$2
	local dir="Figures/WebLogo/$name"
	if [[ ! -e "$output_dir/Mutational_Analysis/$dir/$name-$code-Probability.png" ]]; then
		echo "<td>WARNING: No sequence for ${code:0:1}&gt;${code:1:1} <br/> </td>"
	else
		echo "<td><a href=\"$dir/$name-$code.fa\">$name-$code.fa</a><br/>"
		echo "<img src=\"$dir/$name-$code-Probability.png\"/><br/></td>"
	fi
}

# Append the full report page of one sample (or of Pool_Data) to
# $output_dir/Mutational_Analysis/<name>.html.
#   $1: sample name   $2: heading of the WebLogo section
write_sample_page() {
	local name=$1 logo_heading=$2
	local page="$output_dir/Mutational_Analysis/$name.html"
	local fig="Figures"

	{
		echo "<html>"
		echo "<head>"
		echo "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">"
		# Style sheet providing the legend shown when the mouse is over a figure
		echo "<link rel=\"stylesheet\" href=\"style.css\" />"
		echo "</head>"
		echo "<body>"
		echo "<br/> <center> <h2>Mutational Spectra report for $name</h2> </center> <br/>"

		# Overall mutation distribution / impact on protein sequence
		echo "<table>"
		echo "<tr>"
		echo "<th><h3>Overall mutation distribution</h3></th>"
		echo "<th><h3>Impact on protein sequence</h3></th>"
		echo "</tr><tr>"
		link_cell "$fig/Overall_mutation_distribution/$name/$name-OverallMutationDistribution.txt" "$name-OverallMutationDistribution.txt"
		link_cell "$fig/Impact_protein_sequence/$name/$name-DistributionExoFunc.txt" "$name-DistributionExoFunc.txt"
		echo "</tr><tr>"
		figure_cell "$fig/Overall_mutation_distribution/$name/$name-OverallMutationDistribution.png" 280 \
			"Overall Mutation Distribution" \
			"Proportion of all mutation types (total count are indicated in parenthesis). For indels the counts are based on annotation retrieved from the database ExonicFunc.refGene"
		figure_cell "$fig/Impact_protein_sequence/$name/$name-DistributionExoFunc.png" 400 \
			"Graph 1. Impact on protein sequence" \
			"Impact of all mutations (SBS and Indel) on the protein sequence based on the ExonicFunc.refGene annotation"
		echo "</tr>"
		echo "</table>"
		echo "<br/><br/>"

		# SBS distribution / stranded distribution of SBS
		echo "<table>"
		echo "<tr>"
		echo "<th><h3>SBS distribution</h3></th>"
		echo "<th><h3>Stranded distribution of SBS</h3></th>"
		echo "</tr><tr>"
		link_cell "$fig/SBS_distribution/$name/$name-SBS_distribution.txt" "$name-SBS_distribution.txt"
		link_cell "$fig/Stranded_Analysis/$name/$name-StrandBias.txt" "$name-StrandBias.txt"
		echo "</tr><tr>"
		figure_cell "$fig/SBS_distribution/$name/$name-SBS_distribution.png" 550 \
			"Graph 2. SBS distribution" \
			"Proportion of each type of single base substitution (SBS)"
		figure_cell "$fig/Stranded_Analysis/$name/$name-StrandBias.png" 400 \
			"Graph 3. Stranded distribution of SBS" \
			"Count of the six substitution types on the transcribed and non-transcribed strand"
		echo "</tr>"
		echo "</table>"
		echo "<br/><br/>"

		# Trinucleotide sequence context of SBS on the genomic sequence
		echo "<h3>Trinucleotide sequence context of SBS on the genomic sequence</h3>"
		echo "<table>"
		echo "<tr>"
		link_cell "$fig/Trinucleotide_Sequence_Context/$name/$name-MutationSpectraPercent-Genomic.txt" "$name-MutationSpectraPercent-Genomic.txt"
		link_cell "$fig/Trinucleotide_Sequence_Context/$name/$name-HeatmapPercent-Genomic.txt" "$name-HeatmapPercent-Genomic.txt"
		echo "</tr><tr>"
		figure_cell "$fig/Trinucleotide_Sequence_Context/$name/$name-MutationSpectraPercent-Genomic.png" 1000 \
			"Panel 1. Trinucleotide sequence context of SBS on the genomic sequence" \
			"Proportion of the six substitution types with their trinucleotide sequence context (total number of mutation is shown in parenthesis)"
		figure_cell "$fig/Trinucleotide_Sequence_Context/$name/$name-HeatmapPercent-Genomic.png" 250 \
			"Panel 1. Trinucleotide sequence context of SBS on the genomic sequence" \
			"Proportion of the six substitution types with their trinucleotide sequence context"
		echo "</tr>"
		echo "</table>"
		echo "<br/><br/>"

		# Stranded analysis of the trinucleotide sequence context of SBS
		echo "<h3>Stranded analysis of trinucleotide sequence context of SBS</h3>"
		echo "<table>"
		echo "<tr>"
		link_cell "$fig/Stranded_Analysis/$name/$name-StrandedSignaturePercent.txt" "$name-StrandedSignaturePercent.txt"
		echo "</tr><tr>"
		figure_cell "$fig/Stranded_Analysis/$name/$name-StrandedSignaturePercent.png" 1300 \
			"Panel 2. Stranded analysis of trinucleotide sequence context of SBS" \
			"Proportion of SBS with their trinucleotide context considering the non-transcribed and transcribed strand"
		echo "</tr>"
		echo "</table>"
		echo "<br/><br/>"

		# Sequence logos generated with Weblogo3.
		# Figure legend: Panel 3. Wider sequence context on genomic strand generated with Weblogo3
		echo "<h3>$logo_heading</h3>"
		echo "<table>"
		echo "<tr>"
		weblogo_cell "$name" CA
		weblogo_cell "$name" CG
		weblogo_cell "$name" CT
		echo "</tr>"
		echo "<tr>"
		weblogo_cell "$name" TA
		weblogo_cell "$name" TC
		weblogo_cell "$name" TG
		echo "</tr>"
		echo "</table>"

		echo "</body></html>"
	} >> "$page"
}

# Append the page explaining the "estimate the number of signatures" figure.
#   $1: path of the page to write
write_estimate_page() {
	cat >> "$1" <<'EOF'
<html>
<body>
<br/> <center> <h2>Computed statistics for estimating the number of signatures</h2> </center> <br/>
Several approaches have been proposed to choose the optimal number of signatures to extract with NMF. <br/>
Brunet et al. 2004, proposed to take the first number of signature for which the cophenetic coefficient starts decreasing, <br/>
Hutchins et al. 2008, suggested to choose the first value where the RSS curve presents an inflection point. <br/>
Frigyesi et al. 2008, considered the smallest value at which the decrease in the RSS is lower than the decrease of the RSS obtained from random data. <br/><br/>
The estimation are based on Brunet&#8217;s algorithm computed from 50 runs for each value of signature to estimate. <br/> <br/>
The original data are shuffled for comparing the quality measures obtained with our data (Data x) and from randomized data (Data y). The curves for the actual data are in solid line, those for the randomized data are in dashed line. <br/> <br/>
<img src="Figures/Estimate_Number_Signatures.png" width="1000"/><br/>
</body>
</html>
EOF
}


#########################################
###   HTML page for the tool result   ###
#########################################
# Links are relative: they are interpreted by Galaxy, so no full path is needed.
{
	echo "<html>"
	echo "<body>"

	if [ -d "$output_dir/Mutational_Analysis/Figures" ]; then

		echo "<center> <h2>Mutational spectra report summary</h2> </center>"
		echo "<br/> Download the full report in Excel"

		## One report with all the samples
		if [[ -e "$output_dir/Mutational_Analysis/Report_Mutation_Spectra.xls" ]]; then
			echo "<br/><a href=\"Mutational_Analysis/Report_Mutation_Spectra.xls\">Report_Mutation_Spectra.xls</a>"
		fi

		## One report for each sample
		for file in $names
		do
			name=$(sample_label "$file")
			if [[ -e "$output_dir/Mutational_Analysis/Report_Mutation_Spectra-$name.xls" ]]; then
				echo "<br/><a href=\"Mutational_Analysis/Report_Mutation_Spectra-$name.xls\">Report_Mutation_Spectra-$name.xls</a>"
			fi
		done

		## Report for the pooled samples
		if [[ $parameters =~ "--pooldata" ]]; then
			if [[ -e "$output_dir/Mutational_Analysis/Report_Mutation_Spectra-Pool_Data.xls" ]]; then
				echo "<br/><a href=\"Mutational_Analysis/Report_Mutation_Spectra-Pool_Data.xls\">Report_Mutation_Spectra-Pool_Data.xls</a>"
			fi
		fi

		## Input file for NMF
		if [[ -e "$output_dir/Mutational_Analysis/Figures/Input_NMF/Input_NMF_Count.txt" ]]; then
			echo "<br/><br/> Download the input file for the tool mutSpec-NMF"
			echo "<br/><a href=\"Mutational_Analysis/Figures/Input_NMF/Input_NMF_Count.txt\">Input_NMF_Count.txt</a><br/>"
		fi

		## Computed statistics for estimating the number of signatures
		if [[ $estimSign -gt 0 ]]; then
			echo "<br/> Link to the computed statistics for estimating the number of signatures <br/>"
			if [[ -e "$output_dir/Mutational_Analysis/Figures/Estimate_Number_Signatures.png" ]]; then
				echo "<a href='Mutational_Analysis/EstimatingSignatures.html'>Estimating the number of signatures</a><br/>"
				write_estimate_page "$output_dir/Mutational_Analysis/EstimatingSignatures.html"
			else
				echo "<br/>There is not enough mutations for estimating the number of signatures <br/>"
				echo "Read the tool standard output for more detail<br/>"
			fi
		fi

		## HTML links to the individual samples, and one page per sample
		echo "<br/> Link to individual samples <br/>"
		for file in $names
		do
			name=$(sample_label "$file")
			echo "<a href='Mutational_Analysis/$name.html'>$name</a><br/>"
			write_sample_page "$name" "Wider sequence context with Weblogo3"
		done

		## HTML link to Pool_Data, and its page
		if [[ $parameters =~ "--pooldata" ]]; then
			echo "<a href='Mutational_Analysis/Pool_Data.html'>Pool_Data</a><br/>"
			write_sample_page "Pool_Data" "Sequence logo generated with Weblogo3"
		fi

	fi # End if [ -d $output_dir/Mutational_Analysis/Figures ]

	echo "</body></html>"
} >> "$html"

# Always report success, as the original wrapper did.
exit 0
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/annovar_index.loc.sample Tue Apr 19 03:07:11 2016 -0400 @@ -0,0 +1,7 @@ +# +# Database name (value), dbkey, type, and path. +# +# +#hg19 hg19 filter /home/galaxy/annovar/hg19db/ + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Tue Apr 19 03:07:11 2016 -0400 @@ -0,0 +1,7 @@ +<!-- ANNOVAR files: declares the data table "annovar_index" (columns: value, dbkey, type, path), read from tool-data/annovar_index.loc; lines starting with "#" in that file are comments --> +<tables> +<table name="annovar_index" comment_char="#"> +<columns>value, dbkey, type, path</columns> +<file path="tool-data/annovar_index.loc" /> +</table> +</tables>
<?xml version="1.0"?>
<!-- tool_dependencies.xml: dependencies of the mutspec tools. -->
<tool_dependency>
    <!-- SCRIPT_PATH is read by the wrapper scripts to locate the Perl and R scripts. -->
    <set_environment version="1.0">
        <environment_variable action="set_to" name="SCRIPT_PATH">$REPOSITORY_INSTALL_DIR</environment_variable>
    </set_environment>

    <!-- Perl environment with the CPAN modules listed below. -->
    <package name="perl" version="5.18.1">
        <install version="1.0">
            <actions>
                <action type="setup_perl_environment">
                    <repository changeset_revision="35f117d7396b" name="package_perl_5_18" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu">
                        <package name="perl" prior_installation_required="True" version="5.18.1" />
                    </repository>

                    <!-- The version must match the package provided by package_r_3_1_2
                         (the same changeset revision is declared as R 3.1.2 below). -->
                    <repository changeset_revision="4d2fd1413b56" name="package_r_3_1_2" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu">
                        <package name="R" version="3.1.2" />
                    </repository>

                    <!-- Allow downloading and installing Perl packages from cpan.org -->
                    <package>http://search.cpan.org/CPAN/authors/id/T/TO/TODDR/IPC-Run-0.94.tar.gz</package>
                    <package>http://search.cpan.org/CPAN/authors/id/A/AB/ABIGAIL/Regexp-Common-2013031301.tar.gz</package>
                    <package>http://search.cpan.org/CPAN/authors/id/F/FA/FANGLY/Statistics-R-0.33.tar.gz</package>
                    <package>http://search.cpan.org/CPAN/authors/id/J/JM/JMCNAMARA/OLE-Storage_Lite-0.19.tar.gz</package>
                    <package>http://search.cpan.org/CPAN/authors/id/J/JM/JMCNAMARA/Spreadsheet-WriteExcel-2.40.tar.gz</package>
                    <package>http://search.cpan.org/CPAN/authors/id/D/DL/DLUX/Parallel-ForkManager-0.7.5.tar.gz</package>
                </action>
                <action type="set_environment">
                    <environment_variable action="prepend_to" name="PERL5LIB">$INSTALL_DIR/lib/perl5</environment_variable>
                </action>
            </actions>
        </install>
    </package>

    <package name="perl" version="5.18.1">
        <repository changeset_revision="35f117d7396b" name="package_perl_5_18" owner="iuc" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" />
    </package>

    <package name="weblogo" version="3.3">
        <repository changeset_revision="648e4b32f15c" name="package_weblogo_3_3" owner="devteam" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" />
    </package>

    <package name="numpy" version="1.9">
        <repository changeset_revision="57b37f63cb84" name="package_python_2_7_numpy_1_9" owner="iuc" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" />
    </package>

    <package name="R" version="3.1.2">
        <repository changeset_revision="4d2fd1413b56" name="package_r_3_1_2" owner="iuc" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" />
    </package>

    <package name="mutspec" version="0.1">
        <repository changeset_revision="63cc1719e1aa" name="package_r_mutspec_0_1" owner="iarc" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" />
    </package>

</tool_dependency>