Mercurial > repos > rsajulga > uniprot_id_mapper
view uniprot_ID_mapper/test.py @ 3:6651ac4651f0 draft default tip
Uploaded
| author | rsajulga |
|---|---|
| date | Thu, 16 Jul 2015 16:41:35 -0400 |
| parents | |
| children | |
line wrap: on
line source
import re
import sys
from optparse import OptionParser


def read_tabular(filepath, col):
    """Return the values found in column ``col`` of a tab-separated file.

    Blank lines and lines starting with ``#`` are skipped.  The total number
    of lines read (skipped ones included) is printed to stdout once the file
    has been consumed.

    Args:
        filepath: path of the tab-separated text file to read.
        col: zero-based index of the column to extract.

    Returns:
        A list with one string per data line, in file order.

    Raises:
        IndexError: if a data line has fewer than ``col + 1`` fields.
    """
    accessions = []
    total_number = 0
    with open(filepath) as fp:
        for line in fp:
            total_number += 1
            if line.strip() == '' or line.startswith('#'):
                continue
            fields = line.rstrip('\n').split('\t')
            accessions.append(fields[col])
    print(total_number)
    return accessions


# getAccession
# (pattern, label) pairs tried in order with re.match, i.e. anchored at the
# start of the identifier.  The position of a pair in this list is the
# "type index" used by classify_accessions.  Patterns are raw strings so the
# regex escapes (\d, \w, \.) are not interpreted as string escapes.
fmt = [
    (r'([OPQ][0-9][A-Z0-9]{3}[0-9])|([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})', 'ACC'),
    (r'(THISDOESNTWORK...*)_....*', 'ID'),
    (r'UPI(\d+).*', 'UPARC'),
    (r'(UniRef50_\w+.*)', 'NF50'),
    (r'(UniRef90_\w+.*)', 'NF90'),
    (r'(UniRef100_\w+.*)', 'NF100'),
    (r'([A-L][A-Z]?\d{6})|([A-NR-Z]\d{5})|([A-Z]{4}\d{8})', 'EMBL_ID'),
    (r'([A-Z]*\d*\.\d$)', 'EMBL'),
    (r'([IBC]\d{5})', 'PIR'),
    (r'(Hs\.\d*)', 'UNIGENE_ID'),
    (r'[A-Z]P_(\d*\.\d)', 'P_REFSEQ_AC'),
    (r'[NX][MC]_(\d*\.\d)', 'REFSEQ_NT_ID'),
    (r'(\d[A-Z0-9]{3})', 'PDB_ID'),
    (r'(DP\d{5})', 'DISPROT_ID'),
    (r'(\d*$)', 'P_ENTREZGENEID'),
    (r'(\d*$)', 'P_GI'),
    (r'(\d*$)', 'DMDM_ID'),
    (r'(\d*$)', 'BIOGRID_ID'),
    (r'(\d*$)', 'GUIDETOPHARMACOLOGY_ID'),
    (r'(\d*$)', 'ALLERGOME_ID'),
    (r'(\d*$)', 'PEROXIBASE_ID'),
    (r'(\d*$)', 'REBASE_ID'),
    (r'(\d*$)', 'DNASU_ID'),
    (r'(DIP-\d*N$)', 'DIP_ID'),
    (r'(MINT-\d*)', 'MINT_ID'),
    (r'(9606\.ENSP\d*)', 'STRING_ID'),
    (r'(CHEMBL\d*)', 'CHEMBL_ID'),
    (r'(DB\d*)', 'DRUGBANK_ID'),
    (r'([A-Z]\d\d\.[A-Z0-9]\d{2})', 'MEROPS_ID'),
    (r'NOTHING POPS UP', 'MYCOCLAP_ID'),
    (r'(\d\.[A-Z](?:\.\d*){3})', 'TCDB_ID'),
    (r'NOTHING POPS UP', 'WORLD_2DPAGE_ID'),
    # Label repaired: a stray "BAF85406.1" had been spliced into the middle
    # of "ENSEMBL_ID".
    (r'(ENSG\d*)', 'ENSEMBL_ID'),
    (r'(ENSP\d+)', 'ENSEMBL_PRO_ID'),
    (r'(ENST\d+)', 'ENSEMBL_TRS_ID'),
    (r' ', 'ENSEMBLGENOME_ID'),
    (r' ', 'ENSEMBLGENOME_PRO_ID'),
    (r' ', 'ENSEMBLGENOME_TRS_ID'),
    (r'(hsa:\d*)', 'KEGG_ID'),
    (r'(uc\d*[a-z]*\.\d$)', 'UCSC_ID'),
    (r'(.*[CKN]OG\d*)', 'EGGNOG_ID'),
]

# getFASTA
# (pattern, label) pairs for FASTA-style headers; group 1 captures the
# accession.  Not used by the checks below.
fmt2 = [
    (r'>?(?:sp|tr|sw)\|(\w+).*', 'ACC+ID'),
    # The pipe after "gi" is escaped: unescaped it split the pattern into
    # two alternatives, so "gi|<n>|ref|NP_..." headers matched only the
    # literal "gi" and never captured the accession.
    (r'>?gi\|\d+\|ref\|(NP_\d+\.\d+).*', 'ACC+ID'),
]

# Index into fmt of the category every input is expected to match (EMBL).
EXPECTED_TYPE = 7


def classify_accessions(accessions, expected_type=EXPECTED_TYPE, patterns=None):
    """Count how many identifiers match the expected category pattern.

    Every identifier is tried against every pattern with ``re.match``.  A hit
    on the pattern at index ``expected_type`` counts as a match.  A hit on
    any other pattern counts as a mismatch: its index is printed to stdout
    and the identifier is recorded, once per such pattern, so an identifier
    may appear several times in the mismatch list.

    Args:
        accessions: iterable of identifier strings.
        expected_type: index into ``patterns`` of the expected category.
        patterns: sequence of ``(regex, label)`` pairs; defaults to ``fmt``.

    Returns:
        A tuple ``(matches, mismatches, mismatch, unmatched)`` where
        ``matches`` and ``mismatches`` are counts, ``mismatch`` lists the
        identifiers that hit another category, and ``unmatched`` lists the
        identifiers that did not hit the expected category.
    """
    if patterns is None:
        patterns = fmt
    # Compile once instead of re-resolving each pattern for every identifier.
    compiled = [re.compile(pattern) for pattern, _label in patterns]

    matches = 0
    mismatches = 0
    mismatch = []
    unmatched = []
    for header in accessions:
        matched = False
        for index, regex in enumerate(compiled):
            if not regex.match(header):
                continue
            if index == expected_type:
                matches += 1
                matched = True
            else:
                print(index)
                mismatch.append(header)
                mismatches += 1
        if not matched:
            unmatched.append(header)
    return matches, mismatches, mismatch, unmatched


def main(argv=None):
    """Read identifiers from the file given with ``-f`` and report the counts.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        0 on success.  A missing ``-f`` option ends the program through
        ``OptionParser.error``.
    """
    parser = OptionParser()
    parser.add_option('-f', '--file', dest='filepath',
                      help='direct which file to use')
    parser.add_option('-t', '--type', dest='expected_type', type='int',
                      default=EXPECTED_TYPE,
                      help='index in fmt of the expected category '
                           '(default: %default)')
    (options, args) = parser.parse_args(argv)
    if options.filepath is None:
        # Fail with a usage message rather than a TypeError from open(None).
        parser.error('no input file given (use -f/--file)')

    accessions = read_tabular(options.filepath, 0)
    matches, mismatches, mismatch, unmatched = classify_accessions(
        accessions, options.expected_type)

    print('unmatched: ')
    print(unmatched)
    print('mismatch: ')
    print(mismatch)
    output_file = sys.stdout
    output_file.write("matches: %s\n" % matches)
    output_file.write("mismatches: %s\n" % mismatches)
    return 0


if __name__ == '__main__':
    sys.exit(main())
