Mercurial > repos > cstrittmatter > test_galtrakr_eurl_vtec_wgs_pt_23
annotate scripts/ReMatCh/rematch.py @ 5:f739b302bc9a draft default tip
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author | cstrittmatter |
---|---|
date | Wed, 22 Jan 2020 08:12:43 -0500 |
parents | e37910d2c794 |
children |
rev | line source |
---|---|
0
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
1 #!/usr/bin/env python |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
2 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
3 # -*- coding: utf-8 -*- |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
4 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
5 """ |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
6 rematch.py - Reads mapping against target sequences, checking mapping |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
7 and consensus sequences production |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
8 <https://github.com/B-UMMI/ReMatCh/> |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
9 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
10 Copyright (C) 2017 Miguel Machado <mpmachado@medicina.ulisboa.pt> |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
11 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
12 Last modified: April 12, 2017 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
13 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
14 This program is free software: you can redistribute it and/or modify |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
15 it under the terms of the GNU General Public License as published by |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
16 the Free Software Foundation, either version 3 of the License, or |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
17 (at your option) any later version. |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
18 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
19 This program is distributed in the hope that it will be useful, |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
20 but WITHOUT ANY WARRANTY; without even the implied warranty of |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
22 GNU General Public License for more details. |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
23 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
24 You should have received a copy of the GNU General Public License |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
25 along with this program. If not, see <http://www.gnu.org/licenses/>. |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
26 """ |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
27 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
28 import os |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
29 import sys |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
30 import time |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
31 import argparse |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
32 import modules.utils as utils |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
33 import modules.seqFromWebTaxon as seqFromWebTaxon |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
34 import modules.download as download |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
35 import modules.rematch_module as rematch_module |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
36 import modules.checkMLST as checkMLST |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
37 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
38 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
39 version = '3.2' |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
40 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
41 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
def searchFastqFiles(directory):
    """
    Collect the gzipped fastq files found in each sub-directory of directory.

    Only the immediate sub-directories of ``directory`` are inspected. Hidden
    entries (names starting with '.') and the sub-directory named 'pubmlst'
    are ignored. Inside each sub-directory, non-hidden regular files ending in
    '.fastq.gz' or '.fq.gz' are considered (in sorted order, so the result
    does not depend on the arbitrary order returned by os.listdir).

    Selection rules per sub-directory:
      * exactly one fastq file: that file is used;
      * two or more: the first naming scheme ('_R1_001.f'/'_R2_001.f', then
        '_1.f'/'_2.f') matched by exactly two files provides the pair;
      * otherwise a single fastq file is used as fallback.

    Returns a dictionary mapping the sub-directory name to the list of
    selected fastq paths. Sub-directories without fastq files are omitted.
    """
    filesExtensions = ('.fastq.gz', '.fq.gz')
    pairEnd_filesSeparation = [['_R1_001.f', '_R2_001.f'], ['_1.f', '_2.f']]

    listIDs = {}
    for directory_found in os.listdir(directory):
        # The trailing '' keeps the path separator at the end of the path
        directory_path = os.path.join(directory, directory_found, '')
        if directory_found.startswith('.') or directory_found == 'pubmlst':
            continue
        if not os.path.isdir(directory_path):
            continue

        fastqFound = sorted(f for f in os.listdir(directory_path)
                            if not f.startswith('.') and
                            f.endswith(filesExtensions) and
                            os.path.isfile(os.path.join(directory_path, f)))

        selected = []
        if len(fastqFound) == 1:
            selected = fastqFound
        elif len(fastqFound) >= 2:
            # Search pairs: accept a naming scheme only when exactly two
            # files carry one of its markers
            for first_mark, second_mark in pairEnd_filesSeparation:
                candidates = [fastq for fastq in fastqFound
                              if first_mark in fastq or second_mark in fastq]
                if len(candidates) == 2:
                    selected = candidates
                    break

            # Search single: fall back to one file. The result must remain a
            # list; replacing it with the bare file name would make the path
            # construction below iterate over the characters of the name.
            if len(selected) == 0:
                single = next((fastq
                               for first_mark, second_mark in pairEnd_filesSeparation
                               for fastq in fastqFound
                               if first_mark not in fastq or second_mark not in fastq),
                              None)
                if single is not None:
                    selected = [single]

        if len(selected) >= 1:
            listIDs[directory_found] = [os.path.join(directory_path, f) for f in selected]

    return listIDs
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
88 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
89 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
def getListIDs_fromFile(fileListIDs):
    """
    Read the IDs stored one per line in the text file fileListIDs.

    Empty lines are skipped and line terminators are removed; the remaining
    text of each line is kept as is.

    Returns the list of IDs in file order. Terminates the program through
    sys.exit() when the file does not contain any ID.
    """
    list_ids = []

    # Plain 'rt' is used on purpose: the 'U' flag is deprecated and rejected
    # with ValueError by Python >= 3.11. splitlines() below strips whatever
    # line terminator is present.
    with open(fileListIDs, 'rt') as reader:
        for raw_line in reader:
            pieces = raw_line.splitlines()
            line = pieces[0] if pieces else ''
            if len(line) > 0:
                list_ids.append(line)

    if len(list_ids) == 0:
        sys.exit('No runIDs were found in ' + fileListIDs)

    return list_ids
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
103 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
104 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
def getTaxonRunIDs(taxon_name, outputfile):
    """
    Get the run IDs available for taxon_name.

    seqFromWebTaxon.runSeqFromWebTaxon() is called to produce outputfile,
    which is then parsed: empty lines and lines starting with '#' are
    ignored, and the first tab-separated field of every other line is taken
    as a run ID.

    Returns the list of run IDs in file order (possibly empty).
    """
    # NOTE(review): the meaning of the four boolean flags is defined in
    # modules/seqFromWebTaxon.py and is not visible here - confirm there
    # before changing them.
    seqFromWebTaxon.runSeqFromWebTaxon(taxon_name, outputfile, True, True, True, False)

    runIDs = []
    # Plain 'rt' is used on purpose: the 'U' flag is deprecated and rejected
    # with ValueError by Python >= 3.11.
    with open(outputfile, 'rt') as reader:
        for raw_line in reader:
            pieces = raw_line.splitlines()
            line = pieces[0] if pieces else ''
            if len(line) > 0 and not line.startswith('#'):
                runIDs.append(line.split('\t')[0])

    return runIDs
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
118 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
119 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
def getListIDs(workdir, fileListIDs, taxon_name):
    """
    Determine the IDs to be analysed.

    The source of the IDs is chosen as follows:
      * fileListIDs given: IDs are read from that file;
      * otherwise, taxon_name given: run IDs are retrieved for the taxon and
        the intermediate table is written to
        workdir/IDs_list.seqFromWebTaxon.tab;
      * neither given: workdir is searched for fastq files.

    Returns a tuple (IDs, searched_fastq_files), the second element being
    True only when the IDs came from the fastq files search. Terminates the
    program through sys.exit() when no IDs are obtained.
    """
    from_fastq_search = False

    if fileListIDs is not None:
        ids = getListIDs_fromFile(os.path.abspath(fileListIDs))
    elif taxon_name is not None:
        taxon_table = os.path.join(workdir, 'IDs_list.seqFromWebTaxon.tab')
        ids = getTaxonRunIDs(taxon_name, taxon_table)
    else:
        ids = searchFastqFiles(workdir)
        from_fastq_search = True

    if len(ids) == 0:
        sys.exit('No IDs were found')

    return ids, from_fastq_search
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
134 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
135 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
def format_gene_info(gene_specific_info, minimum_gene_coverage, minimum_gene_identity, reported_data_type):
    """
    Build the text reported for one gene.

    The value stored under the reported_data_type key of gene_specific_info
    is converted to a string and prefixed according to the gene status:
      * 'absent_' when 'gene_coverage' or 'gene_identity' is below the
        respective minimum;
      * 'multiAlleles_' when both minimums are met but
        'gene_number_positions_multiple_alleles' is not 0;
      * no prefix otherwise.
    """
    meets_thresholds = (gene_specific_info['gene_coverage'] >= minimum_gene_coverage and
                        gene_specific_info['gene_identity'] >= minimum_gene_identity)
    if not meets_thresholds:
        return 'absent_' + str(gene_specific_info[reported_data_type])

    if gene_specific_info['gene_number_positions_multiple_alleles'] == 0:
        prefix = ''
    else:
        prefix = 'multiAlleles_'

    return prefix + str(gene_specific_info[reported_data_type])
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
147 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
148 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
def write_data_by_gene(gene_list_reference, minimum_gene_coverage, sample, data_by_gene, outdir, time_str, run_times, minimum_gene_identity, reported_data_type):
    """
    Append the per-gene results of one sample to the combined report.

    The report is the tab-separated file
    outdir/combined_report.data_by_gene.<run_times>.<reported_data_type>.<time_str>.tab
    (the file name uses reported_data_type as received, before it is
    translated into a data_by_gene key). When the file does not exist yet, a
    '#sample' header line is written first, with one column per key of
    gene_list_reference (the column title is the value stored under that key).

    Each entry of data_by_gene is formatted with format_gene_info() and
    stored under its 'header' value; keys of gene_list_reference without a
    matching entry are reported as 'NA'.
    """
    combined_report = os.path.join(outdir, 'combined_report.data_by_gene.' + run_times + '.' + reported_data_type + '.' + time_str + '.tab')

    # Translate the command line data type names into data_by_gene keys
    if reported_data_type == 'coverage_depth':
        reported_data_type = 'gene_mean_read_coverage'
    elif reported_data_type == 'sequence_coverage':
        reported_data_type = 'gene_coverage'

    combined_report_exist = os.path.isfile(combined_report)
    with open(combined_report, 'at') as writer:
        # Freeze the column order once so header and data lines agree
        seq_list = list(gene_list_reference.keys())
        if not combined_report_exist:
            writer.write('#sample' + '\t' + '\t'.join([gene_list_reference[seq] for seq in seq_list]) + '\n')

        results = {}
        for i in data_by_gene:
            results[data_by_gene[i]['header']] = format_gene_info(data_by_gene[i], minimum_gene_coverage, minimum_gene_identity, reported_data_type)

        # Every reference gene without results is reported as 'NA'. This used
        # to be guarded by a comparison between a length and the reference
        # dictionary itself, which was always true, so the fill is done
        # unconditionally.
        for gene in seq_list:
            if gene not in results:
                results[gene] = 'NA'

        writer.write(sample + '\t' + '\t'.join([results[seq] for seq in seq_list]) + '\n')
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
175 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
176 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
def write_sample_report(sample, outdir, time_str, fileSize, run_successfully_fastq, run_successfully_rematch_first, run_successfully_rematch_second, time_taken_fastq, time_taken_rematch_first, time_taken_rematch_second, time_taken_sample, sequencingInformation, sample_data_general_first, sample_data_general_second, fastq_used):
    """
    Append one sample line to outdir/sample_report.<time_str>.tab.

    A header line (starting with '#') is written first when the report does
    not exist yet. The line holds, tab-separated: the general run fields, the
    general data of the first and of the second ReMatCh run, the sequencing
    information and the comma-joined fastq_used list.

    The sample is reported as successfully run unless any of the three
    run_successfully_* values is exactly False (None does not count as a
    failure).
    """
    general_columns = ['sample', 'sample_run_successfully', 'sample_run_time', 'files_size', 'download_run_successfully', 'download_run_time', 'rematch_run_successfully_first', 'rematch_run_time_first', 'rematch_run_successfully_second', 'rematch_run_time_second']
    data_columns = ['number_absent_genes', 'number_genes_multiple_alleles', 'mean_sample_coverage']
    sequencing_columns = ['run_accession', 'instrument_platform', 'instrument_model', 'library_layout', 'library_source', 'extra_run_accession', 'nominal_length', 'read_count', 'base_count', 'date_download']

    report_path = os.path.join(outdir, 'sample_report.' + time_str + '.tab')
    needs_header = not os.path.isfile(report_path)

    with open(report_path, 'at') as writer:
        if needs_header:
            titles = list(general_columns)
            titles.extend(column + '_first' for column in data_columns)
            titles.extend(column + '_second' for column in data_columns)
            titles.extend(sequencing_columns)
            titles.append('fastq_used')
            writer.write('#' + '\t'.join(titles) + '\n')

        step_flags = (run_successfully_fastq, run_successfully_rematch_first, run_successfully_rematch_second)
        sample_ok = all(flag is not False for flag in step_flags)

        fields = [
            sample,
            str(sample_ok),
            str(time_taken_sample),
            str(fileSize),
            str(run_successfully_fastq),
            str(time_taken_fastq),
            str(run_successfully_rematch_first),
            str(time_taken_rematch_first),
            str(run_successfully_rematch_second),
            str(time_taken_rematch_second),
        ]
        fields.extend(str(sample_data_general_first[column]) for column in data_columns)
        fields.extend(str(sample_data_general_second[column]) for column in data_columns)
        fields.extend(str(sequencingInformation[column]) for column in sequencing_columns)
        fields.append(','.join(fastq_used))

        writer.write('\t'.join(fields) + '\n')
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
190 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
191 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
def concatenate_extraSeq_2_consensus(consensus_sequence, reference_sequence, extraSeq_length, outdir):
    """
    Add the reference flanking sequences back to the consensus sequences and
    write the result to 'consensus_concatenated_extraSeq.fasta' inside outdir.

    Both files are parsed with rematch_module.get_sequence_information (the
    reference with extraSeq_length, the consensus with 0). For every consensus
    entry whose 'header' equals a reference entry's 'header', and provided the
    reference sequence is at least extraSeq_length long, the first and last
    extraSeq_length characters of the reference sequence are put around the
    consensus sequence and the entry 'length' is increased accordingly.

    Returns a tuple: (path of the written FASTA file, second value returned by
    get_sequence_information for the consensus file, updated consensus dict).
    """
    reference_dict, _, _ = rematch_module.get_sequence_information(reference_sequence, extraSeq_length)
    consensus_dict, genes, _ = rematch_module.get_sequence_information(consensus_sequence, 0)

    for seq_id, consensus_entry in consensus_dict.items():
        for reference_entry in reference_dict.values():
            if reference_entry['header'] != consensus_entry['header']:
                continue
            reference_seq = reference_entry['sequence']
            if extraSeq_length > len(reference_seq):
                # Reference too short to provide the requested flanks
                continue
            left_flank = reference_seq[:extraSeq_length]
            # A [-0:] slice would return the whole sequence, hence the explicit zero case
            right_flank = reference_seq[-extraSeq_length:] if extraSeq_length != 0 else ''
            target = consensus_dict[seq_id]
            target['sequence'] = left_flank + target['sequence'] + right_flank
            target['length'] += extraSeq_length + len(right_flank)

    consensus_concatenated = os.path.join(outdir, 'consensus_concatenated_extraSeq.fasta')
    with open(consensus_concatenated, 'wt') as writer:
        for seq_id in consensus_dict:
            entry = consensus_dict[seq_id]
            writer.write('>' + entry['header'] + '\n')
            # Sequence is written through chunkstring(..., 80), one returned chunk per line
            for chunk in rematch_module.chunkstring(entry['sequence'], 80):
                writer.write(chunk + '\n')

    return consensus_concatenated, genes, consensus_dict
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
212 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
213 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
def clean_headers_reference_file(reference_file, outdir, extraSeq):
    """
    Parse the reference file and, when its headers were changed during parsing,
    write a copy with the new headers to outdir.

    The file is read with rematch_module.get_sequence_information, whose third
    returned value tells whether headers were changed. If so, a new FASTA file
    named '<reference basename without extension>.headers_renamed.fasta' is
    written in outdir using the parsed headers and sequences.

    Returns a tuple: (reference file to use - the original path as a string or
    the newly written one, second value returned by get_sequence_information,
    parsed sequences dict).
    """
    problematic_characters = ["|", " ", ",", ".", "(", ")", "'", "/", ":"]
    print('Checking if reference sequences contain ' + str(problematic_characters) + '\n')

    sequences, genes, headers_changed = rematch_module.get_sequence_information(reference_file, extraSeq)
    if not headers_changed:
        # Nothing to rewrite: keep using the file that was given
        return str(reference_file), genes, sequences

    print('At least one of the those characters was found. Replacing those with _' + '\n')
    base_name = os.path.splitext(os.path.basename(reference_file))[0]
    renamed_reference = os.path.join(outdir, base_name + '.headers_renamed.fasta')
    with open(renamed_reference, 'wt') as writer:
        for seq_id in sequences:
            entry = sequences[seq_id]
            writer.write('>' + entry['header'] + '\n')
            for chunk in rematch_module.chunkstring(entry['sequence'], 80):
                writer.write(chunk + '\n')
    return renamed_reference, genes, sequences
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
230 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
231 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
def write_mlst_report(sample, run_times, consensus_type, st, alleles_profile, lociOrder, outdir, time_str):
    """
    Append one sample line to the tab-separated MLST report.

    The report is 'mlst_report.<time_str>.tab' inside outdir. When the file
    does not exist yet, a header line ('#sample', 'ReMatCh_run',
    'consensus_type', 'ST' followed by the loci in lociOrder) is written first.
    The sample line holds sample, run_times, consensus_type, the ST as a string
    and the comma-separated alleles_profile split into one column per allele.
    """
    report_path = os.path.join(outdir, 'mlst_report.' + time_str + '.tab')
    # Must be checked before opening: append mode creates the file
    header_needed = not os.path.isfile(report_path)
    with open(report_path, 'at') as writer:
        if header_needed:
            header_fields = ['#sample', 'ReMatCh_run', 'consensus_type', 'ST'] + lociOrder
            writer.write('\t'.join(header_fields) + '\n')
        sample_fields = [sample, run_times, consensus_type, str(st)] + alleles_profile.split(',')
        writer.write('\t'.join(sample_fields) + '\n')
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
239 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
240 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
def run_get_st(sample, mlst_dicts, consensus_sequences, mlstConsensus, run_times, outdir, time_str):
    """
    Determine the ST with checkMLST.getST and record it in the MLST report.

    When mlstConsensus is 'all', every consensus type found in
    consensus_sequences is processed; otherwise only the consensus type named
    by mlstConsensus. Each result is appended to the report through
    write_mlst_report (using mlst_dicts[2] as loci order) and printed.
    """
    if mlstConsensus != 'all':
        st, alleles_profile = checkMLST.getST(mlst_dicts, consensus_sequences[mlstConsensus])
        write_mlst_report(sample, run_times, mlstConsensus, st, alleles_profile, mlst_dicts[2], outdir, time_str)
        print('ST found for ' + mlstConsensus + ' consensus: ' + str(st) + ' (' + alleles_profile + ')')
        return

    for consensus_type in consensus_sequences:
        print('Searching MLST for ' + consensus_type + ' consensus')
        st, alleles_profile = checkMLST.getST(mlst_dicts, consensus_sequences[consensus_type])
        write_mlst_report(sample, run_times, consensus_type, st, alleles_profile, mlst_dicts[2], outdir, time_str)
        print('ST found: ' + str(st) + ' (' + alleles_profile + ')')
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
252 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
253 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
def runRematch(args):
    """
    Run the whole ReMatCh workflow for every sample.

    Steps, in order: create the working directory, start the logger, print the
    general information, obtain the samples list, optionally get the MLST data
    and settle the reference file, clean the reference headers and then, for
    each sample, obtain the fastq files (download unless they were found
    locally), run the ReMatCh module (and a second time when args.doubleRun is
    set), write the per-gene and per-sample reports and remove downloaded
    fastq files unless args.keepDownloadedFastq is set.

    Note that args is modified: softClip_recodeRun and conservedSeq are
    overwritten when args.mlst is given, and extraSeq when no reference file
    was provided.

    Returns a tuple: (number of samples for which none of the fastq, first run
    and second run steps reported False, total number of samples).
    """
    def missing_general_data():
        # Placeholder for the sample report when a ReMatCh run produced no data
        return {'number_absent_genes': None, 'number_genes_multiple_alleles': None, 'mean_sample_coverage': None}

    workdir = os.path.abspath(args.workdir)
    if not os.path.isdir(workdir):
        os.makedirs(workdir)

    asperaKey = None if args.asperaKey is None else os.path.abspath(args.asperaKey.name)

    # Start logger
    logfile, time_str = utils.start_logger(workdir)

    # Get general information
    script_path = utils.general_information(logfile, version, workdir, time_str, args.doNotUseProvidedSoftware, asperaKey, args.downloadCramBam)

    # Set listIDs
    ids_file = None if args.listIDs is None else args.listIDs.name
    listIDs, searched_fastq_files = getListIDs(workdir, ids_file, args.taxon)

    mlst_requested = args.mlst is not None

    if mlst_requested:
        _, mlst_dicts, mlst_sequences = checkMLST.downloadPubMLSTxml(args.mlst, args.mlstSchemaNumber, workdir)
        args.softClip_recodeRun = 'first'
        args.conservedSeq = False

    if args.reference is not None:
        reference_file = os.path.abspath(args.reference.name)
    else:
        reference_file = checkMLST.check_existing_schema(args.mlst, args.mlstSchemaNumber, script_path)
        args.extraSeq = 200
        if reference_file is not None:
            print('Using provided scheme as referece: ' + reference_file)
        else:
            print('It was not found provided MLST scheme sequences for ' + args.mlst)
            print('Trying to obtain reference MLST sequences from PubMLST')
            if len(mlst_sequences) > 0:
                reference_file = checkMLST.write_mlst_reference(args.mlst, mlst_sequences, workdir, time_str)
                args.extraSeq = 0
            else:
                sys.exit('It was not possible to download MLST sequences from PubMLST!')

    # Run ReMatCh for each sample
    print('\n' + 'STARTING ReMatCh' + '\n')

    # Clean sequences headers
    reference_file, gene_list_reference, reference_dict = clean_headers_reference_file(reference_file, workdir, args.extraSeq)

    if mlst_requested:
        # Every MLST gene must be present among the reference sequences
        missing_genes = [header for header in mlst_sequences if header not in gene_list_reference]
        for header in missing_genes:
            print('MLST gene {header} not found between reference sequences'.format(header=header))
        if missing_genes:
            sys.exit('Missing MLST genes from reference sequences (at least sequences names do not match)!')

    if len(gene_list_reference) == 0:
        sys.exit('No sequences left')

    # To use in combined report

    number_samples_successfully = 0
    for sample in listIDs:
        sample_start_time = time.time()
        print('\n\n' + 'Sample ID: ' + sample)

        # Create sample outdir
        sample_outdir = os.path.join(workdir, sample, '')
        if not os.path.isdir(sample_outdir):
            os.mkdir(sample_outdir)

        run_successfully_fastq = None
        time_taken_fastq = 0
        sequencingInformation = {
            'run_accession': None,
            'instrument_platform': None,
            'instrument_model': None,
            'library_layout': None,
            'library_source': None,
            'extra_run_accession': None,
            'nominal_length': None,
            'read_count': None,
            'base_count': None,
            'date_download': None,
        }
        if searched_fastq_files:
            fastq_files = listIDs[sample]
        else:
            # Download Files
            time_taken_fastq, run_successfully_fastq, fastq_files, sequencingInformation = download.runDownload(sample, args.downloadLibrariesType, asperaKey, sample_outdir, args.downloadCramBam, args.threads, args.downloadInstrumentPlatform)

        fileSize = None

        run_successfully_rematch_first = None
        run_successfully_rematch_second = None
        time_taken_rematch_first = 0
        time_taken_rematch_second = 0
        if run_successfully_fastq is not False:
            fileSize = sum(os.path.getsize(fastq) for fastq in fastq_files)
            # Run ReMatCh
            time_taken_rematch_first, run_successfully_rematch_first, data_by_gene, sample_data_general_first, consensus_files, consensus_sequences = rematch_module.runRematchModule(sample, fastq_files, reference_file, args.threads, sample_outdir, args.extraSeq, args.minCovPresence, args.minCovCall, args.minFrequencyDominantAllele, args.minGeneCoverage, args.conservedSeq, args.debug, args.numMapLoc, args.minGeneIdentity, 'first', args.softClip_baseQuality, args.softClip_recodeRun, reference_dict, args.softClip_cigarFlagRecode, args.bowtieOPT, gene_list_reference, args.notWriteConsensus)
            if run_successfully_rematch_first:
                if mlst_requested and args.mlstRun in ('first', 'all'):
                    run_get_st(sample, mlst_dicts, consensus_sequences, args.mlstConsensus, 'first', workdir, time_str)
                write_data_by_gene(gene_list_reference, args.minGeneCoverage, sample, data_by_gene, workdir, time_str, 'first_run', args.minGeneIdentity, 'coverage_depth')
                if args.reportSequenceCoverage:
                    write_data_by_gene(gene_list_reference, args.minGeneCoverage, sample, data_by_gene, workdir, time_str, 'first_run', args.minGeneIdentity, 'sequence_coverage')
                if args.doubleRun:
                    rematch_second_outdir = os.path.join(sample_outdir, 'rematch_second_run', '')
                    if not os.path.isdir(rematch_second_outdir):
                        os.mkdir(rematch_second_outdir)
                    consensus_concatenated_fasta, consensus_concatenated_gene_list, consensus_concatenated_dict = concatenate_extraSeq_2_consensus(consensus_files['noMatter'], reference_file, args.extraSeq, rematch_second_outdir)
                    if len(consensus_concatenated_gene_list) > 0:
                        # Second run maps against the consensus produced by the first run
                        time_taken_rematch_second, run_successfully_rematch_second, data_by_gene, sample_data_general_second, consensus_files, consensus_sequences = rematch_module.runRematchModule(sample, fastq_files, consensus_concatenated_fasta, args.threads, rematch_second_outdir, args.extraSeq, args.minCovPresence, args.minCovCall, args.minFrequencyDominantAllele, args.minGeneCoverage, args.conservedSeq, args.debug, args.numMapLoc, args.minGeneIdentity, 'second', args.softClip_baseQuality, args.softClip_recodeRun, consensus_concatenated_dict, args.softClip_cigarFlagRecode, args.bowtieOPT, gene_list_reference, args.notWriteConsensus)
                        if not args.debug:
                            os.remove(consensus_concatenated_fasta)
                        if run_successfully_rematch_second:
                            if mlst_requested and args.mlstRun in ('second', 'all'):
                                run_get_st(sample, mlst_dicts, consensus_sequences, args.mlstConsensus, 'second', workdir, time_str)
                            write_data_by_gene(gene_list_reference, args.minGeneCoverage, sample, data_by_gene, workdir, time_str, 'second_run', args.minGeneIdentity, 'coverage_depth')
                            if args.reportSequenceCoverage:
                                write_data_by_gene(gene_list_reference, args.minGeneCoverage, sample, data_by_gene, workdir, time_str, 'second_run', args.minGeneIdentity, 'sequence_coverage')
                    else:
                        print('No sequences left after ReMatCh module first run. Second run will not be performed')
                        if os.path.isfile(consensus_concatenated_fasta):
                            os.remove(consensus_concatenated_fasta)
                        if os.path.isdir(rematch_second_outdir):
                            utils.removeDirectory(rematch_second_outdir)

        # Only downloaded files are removed, never the ones found locally
        if not searched_fastq_files and not args.keepDownloadedFastq and fastq_files is not None:
            for fastq in fastq_files:
                if os.path.isfile(fastq):
                    os.remove(fastq)

        time_taken = utils.runTime(sample_start_time)

        general_first = sample_data_general_first if run_successfully_rematch_first else missing_general_data()
        general_second = sample_data_general_second if run_successfully_rematch_second else missing_general_data()
        fastq_used = fastq_files if fastq_files is not None else ''
        write_sample_report(sample, workdir, time_str, fileSize, run_successfully_fastq, run_successfully_rematch_first, run_successfully_rematch_second, time_taken_fastq, time_taken_rematch_first, time_taken_rematch_second, time_taken, sequencingInformation, general_first, general_second, fastq_used)

        # A step left as None (not run) does not count as a failure
        if run_successfully_fastq is not False and run_successfully_rematch_first is not False and run_successfully_rematch_second is not False:
            number_samples_successfully += 1

    return number_samples_successfully, len(listIDs)
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
381 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
382 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
def main():
    """Command-line entry point of ReMatCh.

    Builds the argument parser, validates option combinations and value
    ranges, hands the parsed arguments to runRematch() and prints how many
    samples ran successfully.

    Invalid arguments terminate the program through parser.error(); if no
    sample ran successfully the program terminates through sys.exit() with
    an error message.
    """
    parser = argparse.ArgumentParser(
        prog='rematch.py',
        description='Reads mapping against target sequences, checking mapping and consensus sequences production',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version', help='Version information', action='version',
                        version=str('%(prog)s v' + version))

    parser_optional_general = parser.add_argument_group('General facultative options')
    parser_optional_general.add_argument(
        '-r', '--reference', type=argparse.FileType('r'), metavar='/path/to/reference_sequence.fasta',
        help='Fasta file containing reference sequences',
        required=False)
    parser_optional_general.add_argument(
        '-w', '--workdir', type=str, metavar='/path/to/workdir/directory/',
        help='Path to the directory where ReMatCh will run and produce the outputs with reads (ended with fastq.gz/fq.gz and, in case of PE data, pair-end direction coded as _R1_001 / _R2_001 or _1 / _2) already present (organized in sample folders) or to be downloaded',
        required=False, default='.')
    parser_optional_general.add_argument(
        '-j', '--threads', type=int, metavar='N',
        help='Number of threads to use',
        required=False, default=1)
    parser_optional_general.add_argument(
        '--mlst', type=str, metavar='"Streptococcus agalactiae"',
        help='Species name (same as in PubMLST) to be used in MLST determination. ReMatCh will use Bowtie2 very-sensitive-local mapping parameters and will recode the soft clip CIGAR flags of the first run',
        required=False)
    parser_optional_general.add_argument(
        '--doNotUseProvidedSoftware', action='store_true',
        help='Tells ReMatCh to not use Bowtie2, Samtools and Bcftools that are provided with it')

    parser_optional_rematch = parser.add_argument_group('ReMatCh module facultative options')
    # Hidden option (not shown in --help): meant for conserved sequences like
    # MLST genes, to speed up the analysis by aligning reads using the Bowtie2
    # sensitive algorithm.
    parser_optional_rematch.add_argument('--conservedSeq', action='store_true', help=argparse.SUPPRESS)
    parser_optional_rematch.add_argument(
        '--extraSeq', type=int, metavar='N',
        help='Sequence length added to both ends of target sequences (usefull to improve reads mapping to the target one) that will be trimmed in ReMatCh outputs',
        required=False, default=0)
    parser_optional_rematch.add_argument(
        '--minCovPresence', type=int, metavar='N',
        help='Reference position minimum coverage depth to consider the position to be present in the sample',
        required=False, default=5)
    parser_optional_rematch.add_argument(
        '--minCovCall', type=int, metavar='N',
        help='Reference position minimum coverage depth to perform a base call. Lower coverage will be coded as N',
        required=False, default=10)
    parser_optional_rematch.add_argument(
        '--minFrequencyDominantAllele', type=float, metavar='0.6',
        help='Minimum relative frequency of the dominant allele coverage depth (value between [0, 1]). Positions with lower values will be considered as having multiple alleles (and will be coded as N)',
        required=False, default=0.6)
    parser_optional_rematch.add_argument(
        '--minGeneCoverage', type=int, metavar='N',
        help='Minimum percentage of target reference gene sequence covered by --minCovPresence to consider a gene to be present (value between [0, 100])',
        required=False, default=70)
    parser_optional_rematch.add_argument(
        '--minGeneIdentity', type=int, metavar='N',
        help='Minimum percentage of identity of reference gene sequence covered by --minCovCall to consider a gene to be present (value between [0, 100]). One INDEL will be considered as one difference',
        required=False, default=80)
    # Hidden option (not shown in --help): maximum number of locations to which
    # a read can map (sometimes useful when mapping against similar sequences).
    parser_optional_rematch.add_argument(
        '--numMapLoc', type=int, metavar='N', help=argparse.SUPPRESS, required=False, default=1)
    parser_optional_rematch.add_argument(
        '--doubleRun', action='store_true',
        help='Tells ReMatCh to run a second time using as reference the noMatter consensus sequence produced in the first run. This will improve consensus sequence determination for sequences with high percentage of target reference gene sequence covered')
    parser_optional_rematch.add_argument(
        '--reportSequenceCoverage', action='store_true',
        help='Produce an extra combined_report.data_by_gene with the sequence coverage instead of coverage depth')
    parser_optional_rematch.add_argument(
        '--notWriteConsensus', action='store_true',
        help='Do not write consensus sequences')
    parser_optional_rematch.add_argument(
        '--bowtieOPT', type=str, metavar='"--no-mixed"',
        help='Extra Bowtie2 options',
        required=False)
    parser_optional_rematch.add_argument(
        '--debug', action='store_true',
        help='DeBug Mode: do not remove temporary files')

    parser_optional_mlst = parser.add_argument_group('MLST facultative options')
    # Registered in the MLST group so that it is listed under the MLST heading
    # of the help output together with the other MLST options.
    parser_optional_mlst.add_argument(
        '--mlstReference', action='store_true',
        help='If the curated scheme for MLST alleles is available, tells ReMatCh to use these as reference (force Bowtie2 to run with very-sensitive-local parameters, and sets --extraSeq to 200), otherwise ReMatCh uses the first alleles of each MLST gene fragment in PubMLST as reference sequences (force Bowtie2 to run with very-sensitive-local parameters, and sets --extraSeq to 0)')
    parser_optional_mlst.add_argument(
        '--mlstSchemaNumber', type=int, metavar='N',
        help='Number of the species PubMLST schema to be used in case of multiple schemes available (by default will use the first schema)',
        required=False)
    parser_optional_mlst.add_argument(
        '--mlstConsensus', choices=['noMatter', 'correct', 'alignment', 'all'], type=str, metavar='noMatter',
        help='Consensus sequence to be used in MLST determination (available options: %(choices)s)',
        required=False, default='noMatter')
    parser_optional_mlst.add_argument(
        '--mlstRun', choices=['first', 'second', 'all'], type=str, metavar='first',
        help='ReMatCh run outputs to be used in MLST determination (available options: %(choices)s)',
        required=False, default='all')

    parser_optional_download = parser.add_argument_group('Download facultative options')
    parser_optional_download.add_argument(
        '-a', '--asperaKey', type=argparse.FileType('r'), metavar='/path/to/asperaweb_id_dsa.openssh',
        help='Tells ReMatCh to download fastq files from ENA using Aspera Connect. With this option, the path to Private-key file asperaweb_id_dsa.openssh must be provided (normaly found in ~/.aspera/connect/etc/asperaweb_id_dsa.openssh).',
        required=False)
    parser_optional_download.add_argument(
        '-k', '--keepDownloadedFastq', action='store_true',
        help='Tells ReMatCh to keep the fastq files downloaded')
    parser_optional_download.add_argument(
        '--downloadLibrariesType', type=str, metavar='PAIRED',
        help='Tells ReMatCh to download files with specific library layout (available options: %(choices)s)',
        choices=['PAIRED', 'SINGLE', 'BOTH'], required=False, default='BOTH')
    # The help text names the instrument platform: this option selects the
    # sequencing platform, not the library layout.
    parser_optional_download.add_argument(
        '--downloadInstrumentPlatform', type=str, metavar='ILLUMINA',
        help='Tells ReMatCh to download files with specific instrument platform (available options: %(choices)s)',
        choices=['ILLUMINA', 'ALL'], required=False, default='ILLUMINA')
    parser_optional_download.add_argument(
        '--downloadCramBam', action='store_true',
        help='Tells ReMatCh to also download cram/bam files and convert them to fastq files')

    # All three soft clip options belong to the soft clip group so that they
    # are listed together under the soft clip heading of the help output.
    parser_optional_softClip = parser.add_argument_group('Soft clip facultative options')
    parser_optional_softClip.add_argument(
        '--softClip_baseQuality', type=int, metavar='N',
        help='Base quality phred score in reads soft clipped regions',
        required=False, default=7)
    parser_optional_softClip.add_argument(
        '--softClip_recodeRun', type=str, metavar='first',
        help='ReMatCh run to recode soft clipped regions (available options: %(choices)s)',
        choices=['first', 'second', 'both', 'none'], required=False, default='none')
    parser_optional_softClip.add_argument(
        '--softClip_cigarFlagRecode', type=str, metavar='M',
        help='CIGAR flag to recode CIGAR soft clip (available options: %(choices)s)',
        choices=['M', 'I', 'X'], required=False, default='X')

    # The list of IDs and the taxon name are alternative ways of selecting
    # what to download, hence mutually exclusive.
    parser_optional_download_exclusive = parser.add_mutually_exclusive_group()
    parser_optional_download_exclusive.add_argument(
        '-l', '--listIDs', type=argparse.FileType('r'), metavar='/path/to/list_IDs.txt',
        help='Path to list containing the IDs to be downloaded (one per line)',
        required=False)
    parser_optional_download_exclusive.add_argument(
        '-t', '--taxon', type=str, metavar='"Streptococcus agalactiae"',
        help='Taxon name for which ReMatCh will download fastq files',
        required=False)

    args = parser.parse_args()

    # parser.error() prints the message and exits, so each check below is a
    # guard clause: execution only continues when the check passes.
    if args.reference is None and not args.mlstReference:
        parser.error('At least --reference or --mlstReference should be provided')
    if args.reference is not None and args.mlstReference:
        parser.error('Only --reference or --mlstReference should be provided')
    if args.mlstReference and args.mlst is None:
        parser.error('Please provide species name using --mlst')

    if args.minFrequencyDominantAllele < 0 or args.minFrequencyDominantAllele > 1:
        parser.error('--minFrequencyDominantAllele should be a value between [0, 1]')

    if args.minGeneCoverage < 0 or args.minGeneCoverage > 100:
        parser.error('--minGeneCoverage should be a value between [0, 100]')
    if args.minGeneIdentity < 0 or args.minGeneIdentity > 100:
        parser.error('--minGeneIdentity should be a value between [0, 100]')

    start_time = time.time()

    number_samples_successfully, samples_total_number = runRematch(args)

    # Single-argument, parenthesised print produces the same output whether it
    # is parsed as the Python 2 print statement or the Python 3 print function.
    print('\n' + 'END ReMatCh')
    print('\n' + str(number_samples_successfully) + ' samples out of ' + str(samples_total_number) + ' run successfully')
    # The return value is not needed here; the call is kept for whatever
    # reporting utils.runTime does itself (presumably prints the elapsed
    # time - TODO confirm in utils).
    utils.runTime(start_time)

    if number_samples_successfully == 0:
        sys.exit('No samples run successfully!')
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
463 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
464 |
e37910d2c794
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
# Script entry point: run the command-line interface only when this file is
# executed directly, not when it is imported as a module.
if __name__ == '__main__':
    main()