|
3
|
1 import re
|
|
|
2 import sys
|
|
|
3 from optparse import OptionParser
|
|
|
4
|
|
|
def read_tabular(filepath, col):
    """Collect one column from a tab-separated text file.

    Blank lines and lines starting with '#' are skipped. The total number of
    lines read (skipped lines included) is printed to stdout.

    Args:
        filepath: path of the tab-separated file to read.
        col: index of the column to extract from each data line.

    Returns:
        A list with the value of column `col` for every data line, in file
        order.

    Raises:
        IndexError: if a data line has too few fields for `col`.
    """
    accessions = []
    totalNumber = 0
    with open(filepath) as fp:
        for line in fp:
            totalNumber += 1
            if line.strip() == '' or line.startswith('#'):
                continue
            # Strip '\r' as well as '\n' so CRLF files do not leave a carriage
            # return glued to the last column.
            fields = line.rstrip('\r\n').split('\t')
            accessions.append(fields[col])
    # Single-argument parenthesised form prints identically on Python 2 and 3.
    print(totalNumber)
    return accessions
|
|
|
18
|
|
|
# Accessions taken from the first column of the file given with -f/--file.
accessions = []
parser = OptionParser()
parser.add_option('-f', '--file', dest='filepath', help='direct which file to use')
(options, args) = parser.parse_args()

# Without -f, options.filepath is None and open() would fail with an opaque
# TypeError; parser.error prints the usage text and exits instead.
if options.filepath is None:
    parser.error('no input file given, use -f/--file')

accessions += read_tabular(options.filepath, 0)

# Counters updated by the matching loop further down the script.
matches = 0
mismatches = 0
|
|
|
28
|
|
|
# getAccession
# (pattern, label) pairs. Order is significant: the matching loop refers to an
# entry by its position in this list. Patterns are raw strings so that \d, \w
# and \. reach the regex engine without invalid-escape warnings.
# NOTE(review): 'NOTHING POPS UP' and ' ' look like placeholders for formats
# with no known pattern - confirm before relying on them.
fmt = [
    (r'([OPQ][0-9][A-Z0-9]{3}[0-9])|([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})', 'ACC'),
    (r'(THISDOESNTWORK...*)_....*', 'ID'),
    (r'UPI(\d+).*', 'UPARC'),
    (r'(UniRef50_\w+.*)', 'NF50'),
    (r'(UniRef90_\w+.*)', 'NF90'),
    (r'(UniRef100_\w+.*)', 'NF100'),
    (r'([A-L][A-Z]?\d{6})|([A-NR-Z]\d{5})|([A-Z]{4}\d{8})', 'EMBL_ID'),
    (r'([A-Z]*\d*\.\d$)', 'EMBL'),
    (r'([IBC]\d{5})', 'PIR'),
    (r'(Hs\.\d*)', 'UNIGENE_ID'),
    (r'[A-Z]P_(\d*\.\d)', 'P_REFSEQ_AC'),
    (r'[NX][MC]_(\d*\.\d)', 'REFSEQ_NT_ID'),
    (r'(\d[A-Z0-9]{3})', 'PDB_ID'),
    (r'(DP\d{5})', 'DISPROT_ID'),

    (r'(\d*$)', 'P_ENTREZGENEID'),
    (r'(\d*$)', 'P_GI'),
    (r'(\d*$)', 'DMDM_ID'),
    (r'(\d*$)', 'BIOGRID_ID'),
    (r'(\d*$)', 'GUIDETOPHARMACOLOGY_ID'),
    (r'(\d*$)', 'ALLERGOME_ID'),
    (r'(\d*$)', 'PEROXIBASE_ID'),
    (r'(\d*$)', 'REBASE_ID'),
    (r'(\d*$)', 'DNASU_ID'),

    (r'(DIP-\d*N$)', 'DIP_ID'),
    (r'(MINT-\d*)', 'MINT_ID'),
    (r'(9606\.ENSP\d*)', 'STRING_ID'),
    (r'(CHEMBL\d*)', 'CHEMBL_ID'),
    (r'(DB\d*)', 'DRUGBANK_ID'),
    (r'([A-Z]\d\d\.[A-Z0-9]\d{2})', 'MEROPS_ID'),
    (r'NOTHING POPS UP', 'MYCOCLAP_ID'),
    (r'(\d\.[A-Z](?:\.\d*){3})', 'TCDB_ID'),
    (r'NOTHING POPS UP', 'WORLD_2DPAGE_ID'),
    # Label restored: a stray accession had been pasted into the middle of it.
    (r'(ENSG\d*)', 'ENSEMBL_ID'),
    (r'(ENSP\d+)', 'ENSEMBL_PRO_ID'),
    (r'(ENST\d+)', 'ENSEMBL_TRS_ID'),
    (r' ', 'ENSEMBLGENOME_ID'),
    (r' ', 'ENSEMBLGENOME_PRO_ID'),
    (r' ', 'ENSEMBLGENOME_TRS_ID'),
    (r'(hsa:\d*)', 'KEGG_ID'),
    (r'(uc\d*[a-z]*\.\d$)', 'UCSC_ID'),
    (r'(.*[CKN]OG\d*)', 'EGGNOG_ID'),
]
# getFASTA
# Patterns for FASTA header lines; group 1 captures the accession.
fmt2 = [
    (r'>?(?:sp|tr|sw)\|(\w+).*', 'ACC+ID'),
    # The '|' after 'gi' must be escaped: unescaped it split the pattern into
    # two alternatives, so '>gi|...' headers matched without capturing anything.
    (r'>?gi\|\d+\|ref\|(NP_\d+\.\d+).*', 'ACC+ID'),
]
|
|
|
78 # print options.filepath[10:-4]
|
|
|
# Index into fmt of the format every accession is expected to match
# (position 7 is the 'EMBL' entry in the table as written). Named
# expected_type rather than `type` so the builtin is not shadowed.
expected_type = 7
mismatch = []   # headers that matched some other format (one entry per hit)
unmatched = []  # headers that never matched the expected format
# NOTE(review): indentation was reconstructed; the per-header check is assumed
# to run once per header, after all formats have been tried.
for header in accessions:
    matched = False
    for i, cat in enumerate(fmt):
        pattern = cat[0]
        if not re.match(pattern, header):
            continue
        if i == expected_type:
            matches += 1
            matched = True
        else:
            # Report the index of the unexpected format that matched.
            print(i)
            mismatch.append(header)
            mismatches += 1
    if not matched:
        unmatched.append(header)

# Single-argument parenthesised prints behave the same on Python 2 and 3.
print('unmatched: ')
print(unmatched)
print('mismatch: ')
print(mismatch)
outputFile = sys.stdout
outputFile.write("matches: %s\n" % matches)
outputFile.write("mismatches: %s\n" % mismatches)
|
|
|
107
|
|
|
108 |