Mercurial > repos > rsajulga > uniprot_id_mapper
comparison uniprot_ID_mapper/test.py @ 3:6651ac4651f0 draft default tip
Uploaded
| author | rsajulga |
|---|---|
| date | Thu, 16 Jul 2015 16:41:35 -0400 |
| parents | |
| children | |
comparison
equal
deleted
inserted
replaced
| 2:d0311668d442 | 3:6651ac4651f0 |
|---|---|
| 1 import re | |
| 2 import sys | |
| 3 from optparse import OptionParser | |
| 4 | |
| 5 def read_tabular(filepath,col): | |
| 6 accessions = [] | |
| 7 totalNumber = 0 | |
| 8 with open(filepath) as fp: | |
| 9 for i,line in enumerate(fp): | |
| 10 totalNumber = totalNumber + 1 | |
| 11 if line.strip() == '' or line.startswith('#'): | |
| 12 continue | |
| 13 fields = line.rstrip('\n').split('\t') | |
| 14 accession = fields[col] | |
| 15 accessions.append(accession) | |
| 16 print totalNumber | |
| 17 return accessions | |
| 18 | |
| 19 accessions = [] | |
| 20 parser = OptionParser() | |
| 21 parser.add_option('-f','--file', dest='filepath', help='direct which file to use') | |
| 22 (options, args) = parser.parse_args() | |
| 23 | |
| 24 accessions += read_tabular(options.filepath,0) | |
| 25 | |
| 26 matches = 0 | |
| 27 mismatches = 0 | |
| 28 | |
| 29 # getAccession | |
| 30 fmt = [('([OPQ][0-9][A-Z0-9]{3}[0-9])|([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})','ACC'), | |
| 31 ('(THISDOESNTWORK...*)_....*','ID'), | |
| 32 ('UPI(\d+).*','UPARC'), | |
| 33 ('(UniRef50_\w+.*)','NF50'), | |
| 34 ('(UniRef90_\w+.*)','NF90'), | |
| 35 ('(UniRef100_\w+.*)','NF100'), | |
| 36 ('([A-L][A-Z]?\d{6})|([A-NR-Z]\d{5})|([A-Z]{4}\d{8})','EMBL_ID'), | |
| 37 ('([A-Z]*\d*\.\d$)','EMBL'), | |
| 38 ('([IBC]\d{5})','PIR'), | |
| 39 ('(Hs\.\d*)','UNIGENE_ID'), | |
| 40 ('[A-Z]P_(\d*\.\d)','P_REFSEQ_AC'), | |
| 41 ('[NX][MC]_(\d*\.\d)','REFSEQ_NT_ID'), | |
| 42 ('(\d[A-Z0-9]{3})','PDB_ID'), | |
| 43 ('(DP\d{5})','DISPROT_ID'), | |
| 44 | |
| 45 ('(\d*$)','P_ENTREZGENEID'), | |
| 46 ('(\d*$)','P_GI'), | |
| 47 ('(\d*$)','DMDM_ID'), | |
| 48 ('(\d*$)','BIOGRID_ID'), | |
| 49 ('(\d*$)','GUIDETOPHARMACOLOGY_ID'), | |
| 50 ('(\d*$)','ALLERGOME_ID'), | |
| 51 ('(\d*$)','PEROXIBASE_ID'), | |
| 52 ('(\d*$)','REBASE_ID'), | |
| 53 ('(\d*$)','DNASU_ID'), | |
| 54 | |
| 55 ('(DIP-\d*N$)','DIP_ID'), | |
| 56 ('(MINT-\d*)','MINT_ID'), | |
| 57 ('(9606\.ENSP\d*)','STRING_ID'), | |
| 58 ('(CHEMBL\d*)','CHEMBL_ID'), | |
| 59 ('(DB\d*)','DRUGBANK_ID'), | |
| 60 ('([A-Z]\d\d\.[A-Z0-9]\d{2})','MEROPS_ID'), | |
| 61 ('NOTHING POPS UP','MYCOCLAP_ID'), | |
| 62 ('(\d\.[A-Z](?:\.\d*){3})','TCDB_ID'), | |
| 63 ('NOTHING POPS UP','WORLD_2DPAGE_ID'), | |
| 64 ('(ENSG\d*)','ENSEMBL_ID'), | |
| 65 ('(ENSP\d+)','ENSEMBL_PRO_ID'), | |
| 66 ('(ENST\d+)','ENSEMBL_TRS_ID'), | |
| 67 (' ','ENSEMBLGENOME_ID'), | |
| 68 (' ','ENSEMBLGENOME_PRO_ID'), | |
| 69 (' ','ENSEMBLGENOME_TRS_ID'), | |
| 70 ('(hsa:\d*)','KEGG_ID'), | |
| 71 ('(uc\d*[a-z]*\.\d$)','UCSC_ID'), | |
| 72 ('(.*[CKN]OG\d*)','EGGNOG_ID') | |
| 73 ] | |
| 74 # getFASTA | |
| 75 fmt2 = [('>?(?:sp|tr|sw)\|(\w+).*','ACC+ID'), | |
| 76 ('>?gi|\d+\|ref\|(NP_\d+\.\d+).*','ACC+ID') | |
| 77 ] | |
| 78 # print options.filepath[10:-4] | |
| 79 type = 7 | |
| 80 mismatch = [] | |
| 81 unmatched = [] | |
| 82 matched = 0 | |
| 83 for header in accessions: | |
| 84 for i,cat in enumerate(fmt): | |
| 85 CAT = cat[0] | |
| 86 m = re.match(CAT,header) | |
| 87 if m: | |
| 88 if i == type: | |
| 89 matches = matches + 1 | |
| 90 matched = 1 | |
| 91 else: | |
| 92 print i | |
| 93 mismatch += [header] | |
| 94 mismatches = mismatches + 1 | |
| 95 if matched == 1: | |
| 96 matched = 0 | |
| 97 else: | |
| 98 unmatched += [header] | |
| 99 | |
| 100 print 'unmatched: ' | |
| 101 print unmatched | |
| 102 print 'mismatch: ' | |
| 103 print mismatch | |
| 104 outputFile = sys.stdout | |
| 105 outputFile.write("matches: %s\n" % matches) | |
| 106 outputFile.write("mismatches: %s\n" % mismatches) | |
| 107 | |
| 108 |
