Mercurial > repos > rsajulga > uniprot_id_mapper
diff uniprot_ID_mapper/test.py @ 3:6651ac4651f0 draft default tip
Uploaded
| author | rsajulga |
|---|---|
| date | Thu, 16 Jul 2015 16:41:35 -0400 |
| parents | |
| children |
line wrap: on
line diff
"""Check accession strings against a table of identifier-format regexes.

Script ``uniprot_ID_mapper/test.py`` (changeset 3:6651ac4651f0).

Reads the first column of a tab-separated file given with ``-f/--file``,
tries every pattern in ``fmt`` on each value, and reports how many values
match the format at ``EXPECTED_FORMAT_INDEX`` versus any other format.
"""
import re
import sys
from optparse import OptionParser


def read_tabular(filepath, col):
    """Return the values found in column ``col`` of a tab-separated file.

    Blank lines and lines starting with ``#`` are skipped.  The total number
    of lines read (including skipped ones) is printed to stdout.

    :param filepath: path of the tab-separated file to read.
    :param col: zero-based index of the column to extract.
    :returns: list of the column values, in file order.
    :raises IndexError: if a data line has fewer than ``col + 1`` fields.
    """
    accessions = []
    total_number = 0
    with open(filepath) as fp:
        for line in fp:
            total_number += 1
            if not line.strip() or line.startswith('#'):
                continue
            # Strip '\r' as well so CRLF files do not leave a stray carriage
            # return on the last field (it would defeat the '$' anchors).
            fields = line.rstrip('\r\n').split('\t')
            accessions.append(fields[col])
    print(total_number)
    return accessions


# getAccession: (regex, format label) pairs, tried in order with re.match.
fmt = [
    (r'([OPQ][0-9][A-Z0-9]{3}[0-9])|([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})', 'ACC'),
    (r'(THISDOESNTWORK...*)_....*', 'ID'),
    (r'UPI(\d+).*', 'UPARC'),
    (r'(UniRef50_\w+.*)', 'NF50'),
    (r'(UniRef90_\w+.*)', 'NF90'),
    (r'(UniRef100_\w+.*)', 'NF100'),
    (r'([A-L][A-Z]?\d{6})|([A-NR-Z]\d{5})|([A-Z]{4}\d{8})', 'EMBL_ID'),
    (r'([A-Z]*\d*\.\d$)', 'EMBL'),
    (r'([IBC]\d{5})', 'PIR'),
    (r'(Hs\.\d*)', 'UNIGENE_ID'),
    (r'[A-Z]P_(\d*\.\d)', 'P_REFSEQ_AC'),
    (r'[NX][MC]_(\d*\.\d)', 'REFSEQ_NT_ID'),
    (r'(\d[A-Z0-9]{3})', 'PDB_ID'),
    (r'(DP\d{5})', 'DISPROT_ID'),

    (r'(\d*$)', 'P_ENTREZGENEID'),
    (r'(\d*$)', 'P_GI'),
    (r'(\d*$)', 'DMDM_ID'),
    (r'(\d*$)', 'BIOGRID_ID'),
    (r'(\d*$)', 'GUIDETOPHARMACOLOGY_ID'),
    (r'(\d*$)', 'ALLERGOME_ID'),
    (r'(\d*$)', 'PEROXIBASE_ID'),
    (r'(\d*$)', 'REBASE_ID'),
    (r'(\d*$)', 'DNASU_ID'),

    (r'(DIP-\d*N$)', 'DIP_ID'),
    (r'(MINT-\d*)', 'MINT_ID'),
    (r'(9606\.ENSP\d*)', 'STRING_ID'),
    (r'(CHEMBL\d*)', 'CHEMBL_ID'),
    (r'(DB\d*)', 'DRUGBANK_ID'),
    (r'([A-Z]\d\d\.[A-Z0-9]\d{2})', 'MEROPS_ID'),
    (r'NOTHING POPS UP', 'MYCOCLAP_ID'),
    (r'(\d\.[A-Z](?:\.\d*){3})', 'TCDB_ID'),
    (r'NOTHING POPS UP', 'WORLD_2DPAGE_ID'),
    # Label repaired: a stray "BAF85406.1" had been pasted into the name.
    (r'(ENSG\d*)', 'ENSEMBL_ID'),
    (r'(ENSP\d+)', 'ENSEMBL_PRO_ID'),
    (r'(ENST\d+)', 'ENSEMBL_TRS_ID'),
    (r' ', 'ENSEMBLGENOME_ID'),
    (r' ', 'ENSEMBLGENOME_PRO_ID'),
    (r' ', 'ENSEMBLGENOME_TRS_ID'),
    (r'(hsa:\d*)', 'KEGG_ID'),
    (r'(uc\d*[a-z]*\.\d$)', 'UCSC_ID'),
    (r'(.*[CKN]OG\d*)', 'EGGNOG_ID'),
]

# getFASTA: patterns that pull an accession out of a FASTA header line.
fmt2 = [
    (r'>?(?:sp|tr|sw)\|(\w+).*', 'ACC+ID'),
    # The '|' after "gi" is escaped; unescaped it split the pattern into two
    # alternatives and the RefSeq accession was never captured.
    (r'>?gi\|\d+\|ref\|(NP_\d+\.\d+).*', 'ACC+ID'),
]

# Index into ``fmt`` of the format the input values are expected to have.
EXPECTED_FORMAT_INDEX = 7


def count_matches(headers, formats, expected_index):
    """Tally how each header fares against every pattern in ``formats``.

    Every pattern is tried on every header with ``re.match``.  A hit on the
    pattern at ``expected_index`` counts as a match; a hit on any other
    pattern counts as a mismatch (so one header may be recorded as a
    mismatch several times) and the index of that pattern is printed.

    :param headers: iterable of strings to classify.
    :param formats: sequence of ``(regex, label)`` pairs.
    :param expected_index: position in ``formats`` of the expected format.
    :returns: ``(matches, mismatches, mismatch, unmatched)`` where the first
        two are counts, ``mismatch`` lists headers once per unexpected
        pattern hit, and ``unmatched`` lists headers that did not hit the
        expected pattern.
    """
    # Compile once up front instead of once per header.
    compiled = [re.compile(pattern) for pattern, _label in formats]
    matches = 0
    mismatches = 0
    mismatch = []
    unmatched = []
    for header in headers:
        matched_expected = False
        for index, regex in enumerate(compiled):
            if not regex.match(header):
                continue
            if index == expected_index:
                matches += 1
                matched_expected = True
            else:
                # Diagnostic output kept from the original script.
                print(index)
                mismatch.append(header)
                mismatches += 1
        if not matched_expected:
            unmatched.append(header)
    return matches, mismatches, mismatch, unmatched


def main(argv=None):
    """Parse the command line, classify the file's first column, and report.

    :param argv: argument list to parse; defaults to ``sys.argv[1:]``.
    """
    parser = OptionParser()
    parser.add_option('-f', '--file', dest='filepath', help='direct which file to use')
    options, _args = parser.parse_args(argv)
    if not options.filepath:
        # Fail with a usage message rather than a TypeError from open(None).
        parser.error('an input file is required (-f/--file)')

    accessions = read_tabular(options.filepath, 0)
    matches, mismatches, mismatch, unmatched = count_matches(
        accessions, fmt, EXPECTED_FORMAT_INDEX)

    print('unmatched: ')
    print(unmatched)
    print('mismatch: ')
    print(mismatch)
    output_file = sys.stdout
    output_file.write("matches: %s\n" % matches)
    output_file.write("mismatches: %s\n" % mismatches)


if __name__ == '__main__':
    main()
