from __future__ import print_function

import re
import sys
from optparse import OptionParser

def read_tabular(filepath, col):
    """Collect one column from a tab-separated text file.

    Blank (whitespace-only) lines and lines starting with '#' are skipped.
    The total number of lines read, skipped ones included, is printed to
    stdout before the function returns.

    Args:
        filepath: Path of the tab-delimited file to read.
        col: Zero-based index of the column to extract.

    Returns:
        A list holding the value of column ``col`` for every data line, in
        file order.

    Raises:
        IndexError: If a data line has fewer than ``col + 1`` columns.
    """
    accessions = []
    totalNumber = 0
    with open(filepath) as fp:
        for line in fp:
            # Every physical line counts, including comments and blanks.
            totalNumber += 1
            if not line.strip() or line.startswith('#'):
                continue
            # Strip '\r' too, so CRLF files do not leave a stray carriage
            # return glued to the last column.
            fields = line.rstrip('\r\n').split('\t')
            accessions.append(fields[col])
    print(totalNumber)
    return accessions

# getAccession
# Candidate identifier formats as (regular expression, label) pairs.
# The POSITION of each entry matters: EXPECTED_TYPE_INDEX selects the expected
# format by list index, and the indexes of unexpected hits are printed.
# Entries such as 'NOTHING POPS UP' and ' ' are placeholders kept only so the
# indexes of the remaining entries stay stable.
# NOTE(review): the labels look like UniProt ID-mapping type names -- confirm.
fmt = [
    (r'([OPQ][0-9][A-Z0-9]{3}[0-9])|([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})', 'ACC'),
    (r'(THISDOESNTWORK...*)_....*', 'ID'),
    (r'UPI(\d+).*', 'UPARC'),
    (r'(UniRef50_\w+.*)', 'NF50'),
    (r'(UniRef90_\w+.*)', 'NF90'),
    (r'(UniRef100_\w+.*)', 'NF100'),
    (r'([A-L][A-Z]?\d{6})|([A-NR-Z]\d{5})|([A-Z]{4}\d{8})', 'EMBL_ID'),
    (r'([A-Z]*\d*\.\d$)', 'EMBL'),
    (r'([IBC]\d{5})', 'PIR'),
    (r'(Hs\.\d*)', 'UNIGENE_ID'),
    (r'[A-Z]P_(\d*\.\d)', 'P_REFSEQ_AC'),
    (r'[NX][MC]_(\d*\.\d)', 'REFSEQ_NT_ID'),
    (r'(\d[A-Z0-9]{3})', 'PDB_ID'),
    (r'(DP\d{5})', 'DISPROT_ID'),

    (r'(\d*$)', 'P_ENTREZGENEID'),
    (r'(\d*$)', 'P_GI'),
    (r'(\d*$)', 'DMDM_ID'),
    (r'(\d*$)', 'BIOGRID_ID'),
    (r'(\d*$)', 'GUIDETOPHARMACOLOGY_ID'),
    (r'(\d*$)', 'ALLERGOME_ID'),
    (r'(\d*$)', 'PEROXIBASE_ID'),
    (r'(\d*$)', 'REBASE_ID'),
    (r'(\d*$)', 'DNASU_ID'),

    (r'(DIP-\d*N$)', 'DIP_ID'),
    (r'(MINT-\d*)', 'MINT_ID'),
    (r'(9606\.ENSP\d*)', 'STRING_ID'),
    (r'(CHEMBL\d*)', 'CHEMBL_ID'),
    (r'(DB\d*)', 'DRUGBANK_ID'),
    (r'([A-Z]\d\d\.[A-Z0-9]\d{2})', 'MEROPS_ID'),
    (r'NOTHING POPS UP', 'MYCOCLAP_ID'),
    (r'(\d\.[A-Z](?:\.\d*){3})', 'TCDB_ID'),
    (r'NOTHING POPS UP', 'WORLD_2DPAGE_ID'),
    (r'(ENSG\d*)', 'ENSEMBL_ID'),
    (r'(ENSP\d+)', 'ENSEMBL_PRO_ID'),
    (r'(ENST\d+)', 'ENSEMBL_TRS_ID'),
    (r' ', 'ENSEMBLGENOME_ID'),
    (r' ', 'ENSEMBLGENOME_PRO_ID'),
    (r' ', 'ENSEMBLGENOME_TRS_ID'),
    (r'(hsa:\d*)', 'KEGG_ID'),
    (r'(uc\d*[a-z]*\.\d$)', 'UCSC_ID'),
    (r'(.*[CKN]OG\d*)', 'EGGNOG_ID'),
]

# getFASTA
# FASTA-header patterns; not referenced anywhere else in this script.
# The '|' after 'gi' is escaped so it matches a literal pipe instead of
# splitting the whole expression into two alternatives.
fmt2 = [
    (r'>?(?:sp|tr|sw)\|(\w+).*', 'ACC+ID'),
    (r'>?gi\|\d+\|ref\|(NP_\d+\.\d+).*', 'ACC+ID'),
]

# Index into ``fmt`` of the format every input accession is expected to have
# (7 is the 'EMBL' entry).
EXPECTED_TYPE_INDEX = 7


def classify_headers(headers, patterns, expected_index):
    """Match every header against every pattern and tally the outcome.

    Patterns are applied with ``re.match``, i.e. anchored at the start of the
    header only. A single header can hit several patterns, so it may be
    counted as a match and also appear (more than once) among the mismatches.

    Args:
        headers: Iterable of accession strings to check.
        patterns: Sequence of (regex, label) pairs; only the regex is used.
        expected_index: Index in ``patterns`` of the expected format.

    Returns:
        A tuple ``(matches, mismatch_hits, unmatched)`` where ``matches`` is
        the number of headers matching the expected pattern,
        ``mismatch_hits`` is a list of ``(pattern_index, header)`` for every
        hit on any other pattern, and ``unmatched`` lists the headers that did
        not match the expected pattern.
    """
    # Compile once instead of once per header.
    compiled = [re.compile(regex) for regex, _label in patterns]
    matches = 0
    mismatch_hits = []
    unmatched = []
    for header in headers:
        hit_expected = False
        for index, pattern in enumerate(compiled):
            if not pattern.match(header):
                continue
            if index == expected_index:
                matches += 1
                hit_expected = True
            else:
                mismatch_hits.append((index, header))
        if not hit_expected:
            unmatched.append(header)
    return matches, mismatch_hits, unmatched


def main():
    """Read accessions from the file given with -f/--file and report matches.

    Prints, in order: the index of every unexpected pattern hit, the list of
    headers that did not match the expected format, the list of headers that
    hit another format, and the match / mismatch totals.
    """
    parser = OptionParser()
    parser.add_option('-f', '--file', dest='filepath', help='direct which file to use')
    options, _args = parser.parse_args()
    if not options.filepath:
        # Exit with a usage message rather than crashing in open(None).
        parser.error('an input file is required (-f/--file)')

    accessions = read_tabular(options.filepath, 0)
    matches, mismatch_hits, unmatched = classify_headers(
        accessions, fmt, EXPECTED_TYPE_INDEX)

    for index, _header in mismatch_hits:
        print(index)
    mismatch = [header for _index, header in mismatch_hits]

    print('unmatched: ')
    print(unmatched)
    print('mismatch: ')
    print(mismatch)
    outputFile = sys.stdout
    outputFile.write("matches: %s\n" % matches)
    outputFile.write("mismatches: %s\n" % len(mismatch))


if __name__ == '__main__':
    main()