Mercurial > repos > davidvanzessen > shm_csr
comparison baseline/script_imgt.py @ 81:b6f9a640e098 draft
Uploaded
author | davidvanzessen |
---|---|
date | Fri, 19 Feb 2021 15:10:54 +0000 |
parents | |
children | 729738462297 |
comparison
equal
deleted
inserted
replaced
80:a4617f1d1d89 | 81:b6f9a640e098 |
---|---|
1 #import xlrd #avoid dep | |
2 import argparse | |
3 import re | |
4 | |
5 parser = argparse.ArgumentParser() | |
6 parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence") | |
7 parser.add_argument("--ref", help="Reference file") | |
8 parser.add_argument("--output", help="Output file") | |
9 parser.add_argument("--id", help="ID to be used at the '>>>' line in the output") | |
10 | |
11 args = parser.parse_args() | |
12 | |
13 print "script_imgt.py" | |
14 print "input:", args.input | |
15 print "ref:", args.ref | |
16 print "output:", args.output | |
17 print "id:", args.id | |
18 | |
19 refdic = dict() | |
20 with open(args.ref, 'rU') as ref: | |
21 currentSeq = "" | |
22 currentId = "" | |
23 for line in ref: | |
24 if line.startswith(">"): | |
25 if currentSeq is not "" and currentId is not "": | |
26 refdic[currentId[1:]] = currentSeq | |
27 currentId = line.rstrip() | |
28 currentSeq = "" | |
29 else: | |
30 currentSeq += line.rstrip() | |
31 refdic[currentId[1:]] = currentSeq | |
32 | |
33 print "Have", str(len(refdic)), "reference sequences" | |
34 | |
35 vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})"]#, | |
36 # r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)", | |
37 # r"(IGKV[0-3]D?-[0-9]{1,2})", | |
38 # r"(IGLV[0-9]-[0-9]{1,2})", | |
39 # r"(TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?)", | |
40 # r"(TRGV[234589])", | |
41 # r"(TRDV[1-3])"] | |
42 | |
43 #vPattern = re.compile(r"|".join(vPattern)) | |
44 vPattern = re.compile("|".join(vPattern)) | |
45 | |
46 def filterGene(s, pattern): | |
47 if type(s) is not str: | |
48 return None | |
49 res = pattern.search(s) | |
50 if res: | |
51 return res.group(0) | |
52 return None | |
53 | |
54 | |
55 | |
56 currentSeq = "" | |
57 currentId = "" | |
58 first=True | |
59 with open(args.input, 'r') as i: | |
60 with open(args.output, 'a') as o: | |
61 o.write(">>>" + args.id + "\n") | |
62 outputdic = dict() | |
63 for line in i: | |
64 if first: | |
65 first = False | |
66 continue | |
67 linesplt = line.split("\t") | |
68 ref = filterGene(linesplt[1], vPattern) | |
69 if not ref or not linesplt[2].rstrip(): | |
70 continue | |
71 if ref in outputdic: | |
72 outputdic[ref] += [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())] | |
73 else: | |
74 outputdic[ref] = [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())] | |
75 #print outputdic | |
76 | |
77 for k in outputdic.keys(): | |
78 if k in refdic: | |
79 o.write(">>" + k + "\n") | |
80 o.write(refdic[k] + "\n") | |
81 for seq in outputdic[k]: | |
82 #print seq | |
83 o.write(">" + seq[0] + "\n") | |
84 o.write(seq[1] + "\n") | |
85 else: | |
86 print k + " not in reference, skipping " + k |